1 Introduction

This notebook describes the analysis and interpretation of Methyl-Seq data generated for three developmental timepoints: Stage1 (S1): 0-9.15 hrs Stage2 (S1): 11-12 hrs Stage3 (S1): 24-60 hrs

2 Methylation data processing

2.1 Quality check using FastQC

methyl_seq_raw=Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz # Example: Stage 1, Reads_1
echo "Stage1 raw reads_1:" $methyl_seq_raw
#./software/FastQC/fastqc $methyl_seq_raw Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz 
Stage1 raw reads_1: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz

2.2 Reads trimming using TrimGalore

#Running on stage1 data
#~/software/TrimGalore-0.6.6/trim_galore --paired  Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2.fq.gz

echo "Stage1 clean reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1.fq Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2_val_2.fq"
Stage1 clean reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1.fq Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2_val_2.fq

2.3 Mapping reads to Phaw5.1 genome

Running bismark_genome_preparation

$phaw_genome=/drives/ssd1/wei/genome/phaw_sambaAsm.scaff_seqs_editedScafNames.fa
~/software/Bismark-0.22.3/bismark_genome_preparation --path_to_bowtie ~/software/bowtie2/bowtie2-2.4.2-sra-linux-x86_64/  
--verbose --bowtie2 $phaw_genome

Running bismark_mapping

#~/software/Bismark-0.22.3/bismark --bowtie2 --path_to_bowtie2 ~/software/bowtie2/bowtie2-2.4.2-sra-linux-x86_64/ 
#--samtools_path ~/software/samtools-1.17 --genome /drives/ssd1/wei/genome -N 1 -L 22 --score_min L,0,-0.6  -1 ./Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1.fq -2 ./Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2_val_2.fq

echo "Stage1 mapped reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.bam"
Stage1 mapped reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.bam

2.4 Deduplication

#~/software/Bismark-0.22.3/deduplicate_bismark -p --bam Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.bam
echo "Stage1 deduplicated reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.deduplicated.bam"
Stage1 deduplicated reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.deduplicated.bam

2.5 Methylation calling

#~/software/Bismark-0.22.3/bismark_methylation_extractor -p --comprehensive --samtools_path ~/software/samtools-1.17 --bedGraph --scaffolds  --cytosine_report --genome_folder /drives/ssd1/wei/genome/ Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.deduplicated.bam
echo "Covered CpGs - Stage1"
head -5 /drives/ssd1/wei/emseq/map/nocut/S1/S1_CpGsum
Covered CpGs - Stage1
Scaffold_1_HRSCAF_22    9   +   0   0   0
Scaffold_1_HRSCAF_22    10  -   0   0   0
Scaffold_1_HRSCAF_22    17  +   0   0   0
Scaffold_1_HRSCAF_22    18  -   0   0   0
Scaffold_1_HRSCAF_22    24  +   0   0   0

3 Summarise methylation level per stage

3.1 CpG coverage for all CPGs in the genome across stages - Fig S13A

knitr::opts_chunk$set(message=F,warning=F,echo=T)
#Load libraries
library(tidyr)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(ggpubr)
library(colortools)
library(Rsubread)
library(tibble)
library(scales)
library(ggbreak) 
library(VennDiagram)
library("RColorBrewer")
library(gridExtra)
library(methylKit)
#Coverage for all CPGs in the genome
covAllCpG_9hr <- read.table("/drives/ssd1/wei/formanuel/S1_CpGsum",header = F) #Covered CpGs stage 0-9:15 hrs
covAllCpG_12hr <- read.table("/drives/ssd1/wei/formanuel/S2_CpGsum",header = F) #Covered CpGs stage 11:12 hrs
covAllCpG_24_60hr <- read.table("/drives/ssd1/wei/formanuel/S3_CpGsum",header = F) #Covered CpGs stage 24:60 hrs

#Merge coverage data
covAllCpG_allStages <- dplyr::bind_rows(list(covAllCpG_9hr,covAllCpG_12hr,covAllCpG_24_60hr), .id = 'stage') %>% 
  mutate(stage = recode(stage, '1' = 'emb_9hr', '2' = 'emb_12hr', '3' = 'emb_24_60hr')) %>%
  mutate(coverage = (V4 + V5))
colnames(covAllCpG_allStages) <- c("stage","Scaffold","pos","strand","mCpG_reads","UnmCpG_reads","methylation_level","coverage")

rm(covAllCpG_9hr,covAllCpG_12hr,covAllCpG_24_60hr)
covAllCpG_allStages %>% group_by(stage) %>% summarise(covMedian = median(coverage),covMean = mean(coverage))

covAllCpG_allStages_distribution <- covAllCpG_allStages  %>% mutate(stage=factor(stage, levels = c('emb_9hr','emb_12hr','emb_24_60hr'))) %>%
  filter(coverage <= 50) %>%
  mutate(coverage=factor(coverage, levels = c(seq(0,50,1)))) %>%
  ggplot(aes(x=coverage,fill=stage)) +  geom_bar(stat='count', position='dodge') +
  scale_x_discrete(labels=c(seq(0,50,10)),breaks =seq(0,50,10) ) + 
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) +
  scale_y_continuous(expand = c(0,0),labels=unit_format(unit = "M",scale=1e-6))+ ylab("Counts (million)") +xlab("WGBS-seq coverage") +
  #scale_x_discrete(labels= c("0","20","40","60","80","100")) +
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =0,vjust = 0.65),
        axis.text.y = element_text(size = 13),legend.position=c(0.8,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

#tiff("covAllCpG_allStages_distribution.tiff", width = 5, height = 4, units = "in",res = 400,compression = "lzw")
covAllCpG_allStages_distribution 

#dev.off()

3.2 Time-point specific fractional methylation levels - Fig 5A

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")


#tiff("methyl_level_perStage_allCovCpG.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
meth_allCovCpG_allStages_perStage

#dev.off()

3.3 Methylation level distribution for mCpGs (methylation >= 20) - Fig S13B

mCpg_methylDistribution <- covAllCpG_allStages %>% filter(coverage >= 5, methylation_level >= 20) %>% 
  mutate(stage=factor(stage, levels = c('emb_9hr','emb_12hr','emb_24_60hr'))) %>% 
  ggplot() + 
  geom_histogram(aes(x=methylation_level,y=..count..)) + facet_wrap(~stage) +
  scale_y_continuous(expand = c(0,0),labels=unit_format(unit = "M",scale=1e-6),limits = c(0,1400000))+ 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =30,vjust = 0.65),
        axis.text.y = element_text(size = 13),legend.position=c(0.1,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

#tiff("mCpg_methylDistribution.tiff", width = 10, height = 4, units = "in",res = 400,compression = "lzw")
mCpg_methylDistribution 
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#dev.off()

3.4 Methylation by genomic feature - Fig 5B and S13D

#Open with summarised methylation data per stage and feature
perFeature_MethStats <- read.table("../wgbs_analysis/wholegenome_te",header = T)
colnames(perFeature_MethStats) <- c("region","5XCov_CpGs","mCpGs","mCpGs_percent","methylation_level","stage","Group")
perFeature_MethStats
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
#Plot fraction of methylated CpGs per feature - Fig 13D
mCpG_percent_byFeature <- perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24_60 hrs')) %>%
  ggplot(aes(x=region , y = mCpGs_percent, fill=stage)) +
  geom_col(position = "dodge",colour = "black", size = 0.1) +
  facet_wrap(~ Group,scales="free_x") +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + scale_y_continuous(expand = c(0,0)) +
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.1,0.85),legend.background = element_rect(fill='transparent'),
        axis.title = element_text(size = 14),#strip.text.x = element_text(size = 12),
        strip.background = element_blank(),
        strip.text.x = element_blank()) + xlab(NULL) + ylab("Percent of mCpG") 

#Plot fractional methylation per feature - Fig 5B 
mCpG_level_byFeature <- perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24_60 hrs')) %>%
  ggplot(aes(x=region , y = methylation_level, fill=stage)) +
  geom_col(position = "dodge",colour = "black", size = 0.1) +
  facet_wrap(~ Group,scales="free_x") +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.1,0.85),legend.background = element_rect(fill='transparent'),
        axis.title = element_text(size = 14),  strip.background = element_blank(),
        strip.text.x = element_blank()) + xlab(NULL) + ylab("Methylation %") 

#tiff("mCpG_percent_byFeature.tiff", width = 5, height = 3, units = "in",res = 400,compression = "lzw")
mCpG_percent_byFeature #%>% filter(stage =="emb_9hr") %>% filter(methylation_level == "100" ) %>% dim()

#dev.off()

#tiff("mCpG_level_byFeature.tiff", width = 5, height = 3, units = "in",res = 400,compression = "lzw")
mCpG_level_byFeature #%>% filter(stage =="emb_9hr") %>% filter(methylation_level == "100" ) %>% dim()

#dev.off()

3.5 Distribution of covered CpGs across genomic features - Fig S13C

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
perGeneFeat_MethStats_allStages <- perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24-60 hrs')) %>%
  filter(Group == "Gene",region != "Genebody") %>% tibble() %>% 
  ggplot(aes(x = stage, y = `5XCov_CpGs`,fill= region)) + geom_bar(stat='identity', position='fill') +
  scale_y_continuous(expand = c(0,0))+ ylab("Covered CpGs fraction") + xlab(NULL) + scale_fill_brewer(palette = "Set1")+
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =30,vjust = 0.65),
        axis.text.y = element_text(size = 13),#legend.position=c(0.1,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

perTEs_MethStats_allStages <-perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24-60 hrs')) %>%
  filter(Group == "TE",region != "TE") %>% tibble() %>% 
  ggplot(aes(x = stage, y = `5XCov_CpGs`,fill= region)) + geom_bar(stat='identity', position='fill') +
  scale_y_continuous(expand = c(0,0))+ ylab("Covered CpGs fraction") + xlab(NULL) + scale_fill_brewer(palette = "Dark2")+
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =30,vjust = 0.55),
        axis.text.y = element_text(size = 13),#legend.position=c(0.1,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

#tiff("MethStats_allStages.tiff", width = 6.5, height = 5, units = "in",res = 400,compression = "lzw")
ggarrange(perGeneFeat_MethStats_allStages,perTEs_MethStats_allStages,nrow = 1,ncol = 2)


#dev.off()

4 Dynamics of DNA Methylomes for Early Embryos

I aimed to identify differential methylated CpGs between stages and next interrogate if this changes are prevalent in specific genomic regions. Only included in the analysis CpGs with coverage >=5

4.1 Read the files to a methylRawList object: myobj

methylseq_dir <- "/drives/ssd1/wei/emseq/sum"
methylseq_files <- grep("CpG_report.txt_CG_methykit.txt2$",list.files(methylseq_dir,full.names = T),value=TRUE)

myobj=methRead(as.list(methylseq_files[3:6]),
           sample.id=list("s11h","s11h","s24h","s24h"),
           assembly="phaw51",
           treatment=c(0,0,1,1),
           context="CpG",mincov = 5)
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
|--------------------------------------------------|
|==================================================|
head(myobj[[4]])
methylRaw object with 6 rows
--------------
data part of methylRaw have 8 columns, expected 7 columns
--------------
sample.id: s24h 
assembly: phaw51 
context: CpG 
resolution: base 

Quality check

getMethylationStats(myobj[[1]],plot=TRUE,both.strands=FALSE)

getMethylationStats(myobj[[2]],plot=TRUE,both.strands=FALSE)

getMethylationStats(myobj[[3]],plot=TRUE,both.strands=FALSE)

getMethylationStats(myobj[[4]],plot=TRUE,both.strands=FALSE)

Merging samples into a single table

meth=methylKit::unite(myobj, destrand=FALSE)
uniting...
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
The "ward" method has been renamed to "ward.D"; note new "ward.D2"

Call:
hclust(d = d, method = HCLUST.METHODS[hclust.method])

Cluster method   : ward.D 
Distance         : pearson 
Number of objects: 4 

pc=PCASamples(meth,obj.return = TRUE, adj.lim=c(1,1))

4.2 Filtering CpGs based on variation

Calculate per CpG variation

pm=percMethylation(meth) # get percent methylation matrix
mds=matrixStats::rowSds(pm) # calculate standard deviation of CpGs
head(meth[mds>5,])
methylBase object with 6 rows
--------------
--------------
sample.ids: s11h s11h s24h s24h 
destranded FALSE 
assembly: phaw51 
context: CpG 
treament: 0 0 1 1 
resolution: base 

Histogram of CpG variation

hist(mds,col="gainsboro",xlab="Std. dev. per CpG",breaks = 100)

Keep CpGs with standard deviations larger than 2% - Cluster samples

meth <- meth[mds>2,]
print(paste0("# of CpGs forn downstream analysis: ",nrow(meth)))
[1] "# of CpGs forn downstream analysis: 3418434"

4.3 Extracting differential methylated CpGs. 9hrs to 11 hrs transtion

Samples per conditions were pulled and Fisher’s exact test was applied.

getSampleID(meth)
[1] "s11h" "s11h" "s24h" "s24h"
pooled.meth=pool(meth,sample.ids=c("s11h","s24h"))
dm.pooledf=calculateDiffMeth(pooled.meth)
two groups detected:
 will calculate methylation difference as the difference of
treatment (group: 1) - control (group: 0)

 NOTE: performing 'fast.fisher' instead of 'F' for two groups testing.

Get differentially methylated bases/regions

# All sites
all.diff=getMethylDiff(dm.pooledf,difference=5,qvalue=0.05,type="all")

# get hyper-methylated
hyper=getMethylDiff(dm.pooledf,difference=5,qvalue=0.05,type="hyper")

# get hypo-methylated
hypo=getMethylDiff(dm.pooledf,difference=5,qvalue=0.05,type="hypo")

4.4 Differentially methylated Regions

Tile the genome with windows of 1000bp length and 1000 bp step-size

5 Gene body (+-2kb) methylation analysis - Fig 5C

Process and summarise methylation per window across genes Each gene was split into 100 windows, CpGs were mapped to each window and the average methylation per window was estimated. The same process was performed on 2kb upstream and downstream regions, which were split into 20 windows

Summarising gene body methylation per stage

tiles=tileMethylCounts(myobj,win.size=1000,step.size=1000)
#head(tiles[[1]],3)

Combine into single object

meth_byRegion=methylKit::unite(tiles, destrand=FALSE)
uniting...

Extracting differential methylated regions. 11hrs to 24hrs transtion Samples per conditions were pulled and Fisher’s exact test was applied.

getSampleID(meth_byRegion)
[1] "s11h" "s11h" "s24h" "s24h"
pooled.methRegion=pool(meth_byRegion,sample.ids=c("s11h","s9h"))
dm.pooledf_byRegion=calculateDiffMeth(pooled.methRegion)
two groups detected:
 will calculate methylation difference as the difference of
treatment (group: 1) - control (group: 0)

 NOTE: performing 'fast.fisher' instead of 'F' for two groups testing.
# All sites
all.diff=getMethylDiff(dm.pooledf_byRegion,difference=5,qvalue=0.05,type="all")
print(paste0("All diffMeth:",nrow(all.diff)))
[1] "All diffMeth:37748"
# get hyper-methylated
hyper=getMethylDiff(dm.pooledf_byRegion,difference=5,qvalue=0.05,type="hyper")
print(paste0("All diffMeth:",nrow(hyper)))
[1] "All diffMeth:10546"
# get hypo-methylated
hypo=getMethylDiff(dm.pooledf_byRegion,difference=5,qvalue=0.05,type="hypo")
print(paste0("All diffMeth:",nrow(hypo)))
[1] "All diffMeth:27202"

Intersect with genic and TE elements

#Call window-based methylation files per stage
s1_genebody_w100 <- read.table("/drives/ssd1/wei/genome/genebody/nostrand/S1_genebody_win100_5_sum", header = F, stringsAsFactors = F)
colnames(s1_genebody_w100) <- c("gene_id","window","cpg_count","methylation")
s2_genebody_w100 <- read.table("/drives/ssd1/wei/genome/genebody/nostrand/S2_genebody_win100_5_sum", header = F, stringsAsFactors = F)
colnames(s2_genebody_w100) <- c("gene_id","window","cpg_count","methylation")
s3_genebody_w100 <- read.table("/drives/ssd1/wei/genome/genebody/nostrand/S3_genebody_win100_5_sum", header = F, stringsAsFactors = F)
colnames(s3_genebody_w100) <- c("gene_id","window","cpg_count","methylation")

# Summarise files, estimating window-based average methylation across genes
s1_genebody_w100 <- s1_genebody_w100 %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s1_genebody_w100 <- s1_genebody_w100[,c("gene_id",paste0("w",seq(1:100)))]

s2_genebody_w100 <- s2_genebody_w100  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s2_genebody_w100 <- s2_genebody_w100[,c("gene_id",paste0("w",seq(1:100)))]

s3_genebody_w100 <- s3_genebody_w100  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s3_genebody_w100 <- s3_genebody_w100[,c("gene_id",paste0("w",seq(1:100)))]

# Final data
s1_genebody_w100 <- s1_genebody_w100 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s2_genebody_w100 <- s2_genebody_w100 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s3_genebody_w100 <- s3_genebody_w100 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s1_genebody_w100
s2_genebody_w100
s3_genebody_w100

Summarizing 2kb downstream gene region per stage

#Call window-based methylation files per stage
s1_down_20 <- read.table("/drives/ssd1/wei/genome/down/nostrand/S1_down2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s1_down_20) <- c("gene_id","window","cpg_count","methylation")
s2_down_20 <- read.table("/drives/ssd1/wei/genome/down/nostrand/S2_down2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s2_down_20) <- c("gene_id","window","cpg_count","methylation")
s3_down_20 <- read.table("/drives/ssd1/wei/genome/down/nostrand/S3_down2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s3_down_20) <- c("gene_id","window","cpg_count","methylation")

# Summarise files, estimating window-based average methylation across genes
s1_down_20 <- s1_down_20 %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s1_down_20 <- s1_down_20[,c("gene_id",paste0("w",seq(1:20)))]

s2_down_20 <- s2_down_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s2_down_20 <- s2_down_20[,c("gene_id",paste0("w",seq(1:20)))]

s3_down_20 <- s3_down_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s3_down_20 <- s3_down_20[,c("gene_id",paste0("w",seq(1:20)))]

# Final data
s1_down_20 <- s1_down_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s2_down_20 <- s2_down_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s3_down_20 <- s3_down_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s1_down_20
s2_down_20
s3_down_20

Summarizing 2kb upstream gene region per stage

#Call window-based methylation files per stage
s1_up_20 <- read.table("/drives/ssd1/wei/genome/up/nostrand/S1_up2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s1_up_20) <- c("gene_id","window","cpg_count","methylation")
s2_up_20 <- read.table("/drives/ssd1/wei/genome/up/nostrand/S2_up2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s2_up_20) <- c("gene_id","window","cpg_count","methylation")
s3_up_20 <- read.table("/drives/ssd1/wei/genome/up/nostrand/S3_up2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s3_up_20) <- c("gene_id","window","cpg_count","methylation")

# Summarise files, estimating window-based average methylation across genes
s1_up_20 <- s1_up_20 %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s1_up_20 <- s1_up_20[,c("gene_id",paste0("w",seq(1:20)))]

s2_up_20 <- s2_up_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s2_up_20 <- s2_up_20[,c("gene_id",paste0("w",seq(1:20)))]

s3_up_20 <- s3_up_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s3_up_20 <- s3_up_20[,c("gene_id",paste0("w",seq(1:20)))]

# Final data
s1_up_20 <- s1_up_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s2_up_20 <- s2_up_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s3_up_20 <- s3_up_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s1_up_20
s2_up_20
s3_up_20

Merge upstream, gene_body and downstream average methylation data

up_gene_down_meth <- rbind(
cbind(s1_up_20,s1_genebody_w100,s1_down_20),
cbind(s2_up_20,s2_genebody_w100,s2_down_20),
cbind(s3_up_20,s3_genebody_w100,s3_down_20)) #%>% mutate(stage= c("S1","S2","S3"))
colnames(up_gene_down_meth) <- paste0("w",seq(1:140))
up_gene_down_meth <- up_gene_down_meth %>% dplyr::mutate(stage= c("S1","S2","S3")) %>% pivot_longer(-stage)
up_gene_down_meth

Plot

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_upDown_meth_plot <-up_gene_down_meth %>% 
  mutate(stage=factor(stage, levels= c("S1","S2","S3")),
         name=factor(name,levels =paste0("w",seq(1:140)))) %>%
  ggplot(aes(y=value ,x=name, group=stage,color=stage)) + geom_line(size=0.8)  +
  xlab("") +  ylab("Methylation (%)") +
  ggtitle("") +
  scale_x_discrete(breaks = c('w8','w20','w70','w120','w135'), 
                     labels=c("Upstream","TSS","Gene body","TES","Downstream")) +
  scale_color_manual(values = c("orange2","mistyrose4","maroon"),
                     breaks = c("S1","S2","S3"),
                     labels = c("0-9 hrs","11-12 hrs","24-60 hrs")) + theme_bw() +
  theme(panel.grid.major = element_line(colour=NA),
        panel.background = element_rect(fill= "transparent", colour=NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        panel.grid.minor = element_blank(),
        legend.position = c(0.87,0.65),
        legend.title = element_blank(),
        legend.key.size = unit(15,"pt"),
        legend.text=element_text(size=8),
        axis.text.y = element_text(size = 10),axis.text.x = element_text(size = 10,vjust = 1.15,angle=30,hjus=1.2),
        axis.title.x = element_text(size = 14),axis.title.y = element_text(size = 14))

#tiff("geneBody_upDown_meth.tiff", width = 4, height = 2.5, units = "in",res = 400,compression = "lzw")
geneBody_upDown_meth_plot

#dev.off()

6 Exon-intron methylation level - Fig 5D

Each exon was split in 30 windows, while each intron was split in 80 windows. For each windows, the average CpG methylation was estimated. The graph represent the average methylation per window across genes, using only the first four exons and introns.

Exon methylation

#Call window-based methylation files per stage
s1_exon_30 <- read.table("/drives/ssd1/wei/genome/exon/nostrand/S1_exon_modify_win30_5_sum", header = F, stringsAsFactors = F)
colnames(s1_exon_30) <- c("gene_id","exon_n","window","cpg_count","methylation")
s2_exon_30 <- read.table("/drives/ssd1/wei/genome/exon/nostrand/S2_exon_modify_win30_5_sum", header = F, stringsAsFactors = F)
colnames(s2_exon_30) <- c("gene_id","exon_n","window","cpg_count","methylation")
s3_exon_30 <- read.table("/drives/ssd1/wei/genome/exon/nostrand/S3_exon_modify_win30_5_sum", header = F, stringsAsFactors = F)
colnames(s3_exon_30) <- c("gene_id","exon_n","window","cpg_count","methylation")
s1_exon_30 %>% head()
#Summarised intron methylation - Stage1
exon_methy_list_s1 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s1_exon_30 %>% mutate(window=paste0("w",window)) %>% filter(exon_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-exon_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:30)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

exon_methy_list_s1[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
exon_methy_list_s2 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s2_exon_30 %>% mutate(window=paste0("w",window)) %>% filter(exon_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-exon_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:30)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

exon_methy_list_s2[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
exon_methy_list_s3 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s3_exon_30 %>% mutate(window=paste0("w",window)) %>% filter(exon_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-exon_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:30)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

exon_methy_list_s3[[i]] <- temp_df
}
exon_methy_list_s1
[[1]]

[[2]]

[[3]]

[[4]]
NA

Intron methylation

#Call window-based methylation files per stage
s1_intron_80 <- read.table("/drives/ssd1/wei/genome/intron/nostrand/S1_intron_modify_win80_5_sum", header = F, stringsAsFactors = F)
colnames(s1_intron_80) <- c("gene_id","intron_n","window","cpg_count","methylation")
s2_intron_80 <- read.table("/drives/ssd1/wei/genome/intron/nostrand/S2_intron_modify_win80_5_sum", header = F, stringsAsFactors = F)
colnames(s2_intron_80) <- c("gene_id","intron_n","window","cpg_count","methylation")
s3_intron_80 <- read.table("/drives/ssd1/wei/genome/intron/nostrand/S3_intron_modify_win80_5_sum", header = F, stringsAsFactors = F)
colnames(s3_intron_80) <- c("gene_id","intron_n","window","cpg_count","methylation")
s1_intron_80 %>% head()
#Summarised intron methylation - Stage1
intron_methy_list_s1 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s1_intron_80 %>% mutate(window=paste0("w",window)) %>% filter(intron_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-intron_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:80)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

intron_methy_list_s1[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
intron_methy_list_s2 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s2_intron_80 %>% mutate(window=paste0("w",window)) %>% filter(intron_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-intron_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:80)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

intron_methy_list_s2[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
intron_methy_list_s3 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s3_intron_80 %>% mutate(window=paste0("w",window)) %>% filter(intron_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-intron_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:80)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

intron_methy_list_s3[[i]] <- temp_df
}
intron_methy_list_s1
[[1]]

[[2]]

[[3]]

[[4]]
NA

Merge exon and intron methylation data

exon_intron_meth <- rbind(cbind(exon_methy_list_s1[[1]],intron_methy_list_s1[[1]],exon_methy_list_s1[[2]],intron_methy_list_s1[[2]],
                          exon_methy_list_s1[3],intron_methy_list_s1[[3]],exon_methy_list_s1[[4]],intron_methy_list_s1[[4]]),
                    cbind(exon_methy_list_s2[[1]],intron_methy_list_s2[[1]],exon_methy_list_s2[[2]],intron_methy_list_s2[[2]],
                          exon_methy_list_s2[3],intron_methy_list_s2[[3]],exon_methy_list_s2[[4]],intron_methy_list_s2[[4]]),
                    cbind(exon_methy_list_s3[[1]],intron_methy_list_s3[[1]],exon_methy_list_s3[[2]],intron_methy_list_s3[[2]],
                          exon_methy_list_s3[3],intron_methy_list_s3[[3]],exon_methy_list_s3[[4]],intron_methy_list_s3[[4]]))

colnames(exon_intron_meth) <- paste0("w",seq(1:440))
exon_intron_meth <- exon_intron_meth %>% dplyr::mutate(stage= c("S1","S2","S3")) %>% pivot_longer(-stage)
exon_intron_meth
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
exonIntron_meth_plot <- exon_intron_meth %>% 
  mutate(stage=factor(stage, levels= c("S1","S2","S3")),
         name=factor(name,levels =paste0("w",seq(1:440)))) %>%
  ggplot(aes(y=value ,x=name, group=stage,color=stage)) + geom_line(size=0.8)  +
  xlab("") +  ylab("Methylation (%)") + ggtitle(NULL) +
  scale_x_discrete(breaks = paste0('w',c(1,15,30,70,110,125,140,180,220,235,250,290,330,345,360,400,440)), 
                     labels=c(" ","Exon1"," ","Intron1"," ","Exon2"," ","Intron2"," ","Exon3"," ","Intron3"," ","Exon4"," ","Intron4"," ")) +
  scale_color_manual(values = c("orange2","mistyrose4","maroon"),
                     breaks = c("S1","S2","S3"),
                     labels = c("0-9 hrs","11-12 hrs","24-60 hrs")) + theme_bw() +
  theme(panel.grid.major = element_line(colour=NA),
        panel.background = element_rect(fill= "transparent", colour=NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        panel.grid.minor = element_blank(), legend.position = c(0.87,0.65), legend.title = element_blank(),
        legend.key.size = unit(12,"pt"), legend.text=element_text(size=7),
        axis.text.y = element_text(size = 10),axis.text.x = element_text(size = 8,vjust = 0.5),
        axis.title.x = element_text(size = 14),axis.title.y = element_text(size = 14))

#tiff("exonIntron_meth.tiff", width = 4, height = 2, units = "in",res = 400,compression = "lzw")
exonIntron_meth_plot

#dev.off()

7 Analysis of covered Cpgs, %of mCpg and methylation level for whole gene length and first 2500 bp

Load data

#CpGs covered along full gene body - Number and methylation average per gene
s1_covCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_covCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_covCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_covCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_covCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_covCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

#mCpGs along full gene body - Number and methylation average per gene

s1_mCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_mCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_mCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_mCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_mCpG_count<-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_mCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

#CpGs covered along 5' 2500 bp of each gene - Number and methylation average per gene
s1_covCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_covCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_covCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_covCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_covCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_covCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

#mCpGs along 5' 2500 bp of each gene - Number and methylation average per gene

s1_mCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_mCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_mCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_mCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_mCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_mCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

Merge and summarise whole gene body methylation data

geneBody_full_methData <- dplyr::bind_rows(list(s1_covCpG_count,s2_covCpG_count,s3_covCpG_count,s1_mCpG_count,s2_mCpG_count,s3_mCpG_count), .id = 'class') %>%
  mutate(class = recode(class, '1' = 'Cov9hrs', '2' = 'Cov12hrs', '3' = 'Cov24hrs',
                        '4' = 'mCpG9hrs', '5' = 'mCpG12hrs', '6' = 'mCpG24hrs'))
colnames(geneBody_full_methData) <- c("class","gene","group","strand","CpG_N","methylation")
geneBody_full_methData %>% head()

Estimate percentage of gene body methylated CpGs (mCpG) - Fig 5E left

knitr::opts_chunk$set(message=F,warning=F,echo=T)
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_mcpg_perc <- geneBody_full_methData %>% select(-c(group,strand)) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  mutate(mCpG_perc9hr= (CpG_N_mCpG9hrs/CpG_N_Cov9hrs)*100,
         mCpG_perc12hr= (CpG_N_mCpG12hrs/CpG_N_Cov12hrs)*100,
         mCpG_perc24hr= (CpG_N_mCpG24hrs/CpG_N_Cov24hrs)*100) %>% select(gene,mCpG_perc9hr,mCpG_perc12hr,mCpG_perc24hr) %>%
  melt(id.vars="gene",value.name = "mCpG_perc") %>% 
  ggplot(aes(x=variable,y=mCpG_perc,fill=variable)) + geom_boxplot(outlier.colour = "grey",size=0.4) +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  scale_y_break(c(40, 95), scales = 0.1, ticklabels=c(95, 100)) + ylab("Gene body mCpG %") +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank(),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12)) + xlab(NULL) + ylab(NULL)

suppressWarnings(grid.arrange(geneBody_mcpg_perc))


#tiff("geneBody_mcpg_perc.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody_mcpg_per
#dev.off()

Estimate gene body methylation level - Fig 5F left

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

geneBody_methLevel_all <- geneBody_full_methData %>% select(-c(group,strand)) %>% #sample_n(6000) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  select(gene,methylation_Cov9hrs:methylation_Cov24hrs) %>%
  melt(id.vars="gene",value.name = "Methylation") %>% #tibble() %>% 
  ggplot(aes(x=variable,y=as.numeric(Methylation),fill=variable)) + geom_boxplot() +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +  
  scale_y_break(c(30, 95), scales = 0.1, ticklabels=c(95, 100)) + #scale_x_break(c(40,90), scales=2) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + xlab(NULL) + ylab("Gene body methylation %")

suppressWarnings(grid.arrange(geneBody_methLevel_all))

#tiff("geneBody_methLevel_all.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody_methLevel_all
#dev.off()

Merge and summarise gene body (first 2500 bp) methylation data

geneBody_up2500_methData <- dplyr::bind_rows(list(s1_covCpG_count_first2500bp,s2_covCpG_count_first2500bp,s3_covCpG_count_first2500bp,
                                                  s1_mCpG_count_first2500bp,s2_mCpG_count_first2500bp,s3_mCpG_count_first2500bp), .id = 'class') %>% 
  mutate(class = recode(class, '1' = 'Cov9hrs', '2' = 'Cov12hrs', '3' = 'Cov24hrs',
                        '4' = 'mCpG9hrs', '5' = 'mCpG12hrs', '6' = 'mCpG24hrs'))
colnames(geneBody_up2500_methData) <- c("class","gene","group","strand","CpG_N","methylation")
geneBody_up2500_methData %>% head()

Estimate percentage of gene body (first 2500 bp) methylated CpGs (mCpG) - Fig 5E right

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

geneBody2500_mcpg_perc <- geneBody_up2500_methData %>%  select(-c(group,strand)) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  mutate(mCpG_perc9hr= (CpG_N_mCpG9hrs/CpG_N_Cov9hrs)*100,
         mCpG_perc12hr= (CpG_N_mCpG12hrs/CpG_N_Cov12hrs)*100,
         mCpG_perc24hr= (CpG_N_mCpG24hrs/CpG_N_Cov24hrs)*100) %>% select(gene,mCpG_perc9hr,mCpG_perc12hr,mCpG_perc24hr) %>%
  melt(id.vars="gene",value.name = "mCpG_perc") %>% 
  ggplot(aes(x=variable,y=mCpG_perc,fill=variable)) + geom_boxplot(outlier.colour = "grey",size=0.4) +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  scale_y_break(c(40, 95), scales = 0.1, ticklabels=c(95, 100)) + #scale_x_break(c(40,90), scales=2) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank(),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12)) + xlab(NULL) + ylab(NULL)

suppressWarnings(grid.arrange(geneBody2500_mcpg_perc))


#tiff("geneBody2500_mcpg_perc.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody2500_mcpg_perc
#dev.off()

Estimate gene body (first 2500 bp) methylation level - Fig 5F right

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

geneBody2500_methLevel_all <- geneBody_up2500_methData %>% select(-c(group,strand)) %>% #sample_n(6000) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  select(gene,methylation_Cov9hrs:methylation_Cov24hrs) %>%
  melt(id.vars="gene",value.name = "Methylation") %>% #tibble() %>% 
  ggplot(aes(x=variable,y=as.numeric(Methylation),fill=variable)) + geom_boxplot() +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +  
  scale_y_break(c(30, 95), scales = 0.1, ticklabels=c(95, 100)) + #scale_x_break(c(40,90), scales=2) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + xlab(NULL) +  ylab("Gene body methylation %")

suppressWarnings(grid.arrange(geneBody2500_methLevel_all))


#tiff("geneBody2500_methLevel_all.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody2500_methLevel_all
#dev.off()

Covered CpGs per gene - Fig S13E

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_Cov_N_all_density <- geneBody_full_methData %>% select(-c(group,strand)) %>% 
  filter(grepl('Cov',class)) %>% mutate(class=gsub("Cov","",class)) %>% mutate(class= factor(class,levels = c("9hrs","12hrs","24hrs"))) %>%
  ggplot(aes(x=as.numeric(CpG_N),fill= `class`)) + geom_density(alpha=0.5) + xlim(NA,500) + scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon"),labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  geom_vline(aes(xintercept = 10), color="chocolate4", linetype="dashed", size=0.5) + theme_bw() +
  theme(legend.title=element_blank(),legend.background = element_rect(fill='transparent'),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.85,0.8),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + ylab("Density") + xlab("Covered CpGs per gene")

#tiff("geneBody_Cov_N_all_density.tiff", width = 3.5, height = 3, units = "in",res = 400,compression = "lzw")
geneBody_Cov_N_all_density
Warning: Removed 39023 rows containing non-finite values (stat_density).

#dev.off()

Pattern of gene methylation per stage - Fig 5G

#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_methLevel_all_density <- geneBody_full_methData %>% select(-c(group,strand)) %>% 
  filter(grepl('Cov',class)) %>% mutate(class=gsub("Cov","",class)) %>% mutate(class= factor(class,levels = c("9hrs","12hrs","24hrs"))) %>%
  ggplot(aes(x=as.numeric(methylation),fill= `class`)) + geom_density(alpha=0.5) +
  scale_x_continuous(trans = log10_trans(),
                     breaks = c(0,0.01,0.1,1,3,10,100),
                     labels = label_number(accuracy = 1)) + scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon"),labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  geom_vline(aes(xintercept = 3), color="chocolate4", linetype="dashed", size=0.5) + theme_bw() +
  theme(legend.title=element_blank(),legend.background = element_rect(fill='transparent'),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.15,0.8),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + ylab("Density") + xlab("Gene methylation %")


#tiff("geneBody_methLevel_all_density.tiff", width = 3.5, height = 3, units = "in",res = 400,compression = "lzw")
geneBody_methLevel_all_density
Warning: Transformation introduced infinite values in continuous x-axis
Warning: Removed 4791 rows containing non-finite values (stat_density).

#dev.off()

8 Identify and quantify methylated genes per stage - Fig 5H

setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

methylatedGenes_9hrs <- geneBody_full_methData %>% filter(class == "Cov9hrs" & CpG_N >= 5 & methylation >= 3) %>% pull(gene) 
methylatedGenes_12hrs <- geneBody_full_methData %>% filter(class == "Cov12hrs" & CpG_N >= 5 & methylation >= 3) %>% pull(gene)
methylatedGenes_24hrs <- geneBody_full_methData %>% filter(class == "Cov24hrs" & CpG_N >= 5 & methylation >= 3) %>% pull(gene)

Reduce(intersect,list(methylatedGenes_9hrs,methylatedGenes_12hrs,methylatedGenes_24hrs)) %>% unique() %>% length()

#Plot venn diagram intersecting methylated genes among stages

venn.diagram(
  x = list(methylatedGenes_9hrs,methylatedGenes_12hrs,methylatedGenes_24hrs),
  category.names = c("9 hrs" , "11-12 hrs","24-60 hrs"),
  filename = 'methylatedGenes_overlapp.tiff',
  height = 4, width = 4, resolution = 300, imagetype = "tiff", 
  units = "in", compression = "lzw", fill= c("orange2","mistyrose4","maroon" ),
  output=T, scaled=T,print.mode=c("raw","percent"))
  

Visualise venn diagrame in the notebook

plt = venn.diagram(
  x = list(methylatedGenes_9hrs,methylatedGenes_12hrs,methylatedGenes_24hrs),
  category.names = c("9 hrs" , "11-12 hrs","24-60 hrs"),
  filename = NULL,
  height = 4, width = 4, resolution = 300, imagetype = "tiff", 
  units = "in", compression = "lzw", fill= c("orange2","mistyrose4","maroon" ),
  output=T, scaled=T,print.mode=c("raw","percent"))
grid.newpage()
grid::grid.draw(plt)

9 Relationship between gene methylation and expression levels

For each methylome dataset (0-9,11-12 and 24-60 hrs) we analysed the expression levels in three subsequent embryonic stages. Expression was categorised as absent (TPM = 0), low(TPM <=1), moderate (TPM <=5) and high (TPM >=5)

Merge Methyl and expression data

embryo_exonCounts_polyA_tpm <- read.table("/drives/ssd1/manuel/phaw/2022_analysis/mzt_analysis/embryo_exonCounts_polyA_tpm.txt",header = T,stringsAsFactors = F)
colnames(embryo_exonCounts_polyA_tpm) <- c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs")

embryo_Exp_Methyl_data <- merge(embryo_exonCounts_polyA_tpm,
geneBody_full_methData %>% select(-c(group,strand)) %>% 
  filter(grepl('Cov',class)) %>% mutate(class=gsub("Cov","",class)) %>% 
  mutate(class= factor(class,levels = c("9hrs","12hrs","24hrs"))) %>% 
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)),by.x = "gene_id",by.y = "gene")

embryo_Exp_Methyl_data

Gene expression at 11, 12 and 13-14 hrs embryos based on methylation from 0-9:15 hrs embryos - Fig 5I, left

get_my_colors <- colorRampPalette(brewer.pal(6, "Dark2"))
mycolors <- get_my_colors(8)

# Correlate 9hrs methylation with all expression timepoints - Boxplots
ExpMethyl_cor_9hrs <- embryo_Exp_Methyl_data %>% select(-c(`CpG_N_9hrs`:`CpG_N_24hrs`,`methylation_12hrs`,`methylation_24hrs`)) %>% 
  pivot_longer(cols = -c(gene_id,methylation_9hrs),names_to = "exp_stage",values_to="exp_count") %>% 
  mutate(exp_stage=factor(exp_stage,levels = c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs"))) %>%
  mutate(exp_cat= ifelse(exp_count == 0, "0",
                         ifelse(between(exp_count,0,1),"≤ 1",
                                ifelse(between(exp_count,1,5),"≤ 5","≥ 5")))) %>% 
  filter(exp_stage %in% c("11hrs","12hrs","14hrs")) %>%  # filter(gene_id %in% methylatedGenes_9hrs) %>%
  mutate(exp_cat=factor(exp_cat,levels = c("0","≤ 1","≤ 5","≥ 5"))) %>%
  ggplot(aes(fill=exp_stage,x=exp_cat,y=methylation_9hrs)) + geom_boxplot(outlier.colour = "grey",outlier.size = 0.4,alpha=0.6)+
  ylim(NA,20) + # to print in notebook
  #scale_y_break(c(20, 95), scales = 0.01, ticklabels=c(95, 100)) + # to save figure
  scale_fill_manual(values=c("firebrick3","darkslategray","dodgerblue")) + 
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  theme(legend.title=element_blank(),legend.key.size = unit(0.5, 'cm'),legend.text = element_text(size=6),
        legend.position="top",legend.justification="right",legend.margin=margin(0,0,0,0),#legend.box.margin=margin(-20,-100,-10,-10),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10), axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  ylab("Gene methylation % - 9hrs")  + xlab(NULL) #xlab("Gene expression TPM (Log2)") +

#tiff("ExpMethyl_cor_9hrs.tiff", width = 2.5, height = 3.5, units = "in",res = 300,compression = "lzw")
#ExpMethyl_cor_9hrs
#dev.off()
suppressWarnings(grid.arrange(ExpMethyl_cor_9hrs))

Gene expression at 12, 14 and 16 hrs embryos based on methylation from 11-12 hrs embryos - Fig 5I, middle

ExpMethyl_cor_12hrs <- embryo_Exp_Methyl_data %>% select(-c(`CpG_N_9hrs`:`CpG_N_24hrs`,`methylation_9hrs`,`methylation_24hrs`)) %>% 
  pivot_longer(cols = -c(gene_id,methylation_12hrs),names_to = "exp_stage",values_to="exp_count") %>% 
  mutate(exp_stage=factor(exp_stage,levels = c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs"))) %>%
  mutate(exp_cat= ifelse(exp_count == 0, "0",
                         ifelse(between(exp_count,0,1),"≤ 1",
                                ifelse(between(exp_count,1,5),"≤ 5","≥ 5")))) %>% 
  filter(exp_stage %in% c("12hrs","14hrs","16hrs")) %>%
  mutate(exp_cat=factor(exp_cat,levels = c("0","≤ 1","≤ 5","≥ 5"))) %>%
  ggplot(aes(fill=exp_stage,x=exp_cat,y=methylation_12hrs)) + geom_boxplot(outlier.colour = "grey",outlier.size = 0.4,alpha=0.6)+
  ylim(NA,20) + # to print in notebook
  #scale_y_break(c(20, 95), scales = 0.01, ticklabels=c(95, 100)) + # to save figure
  scale_fill_manual(values=c("darkslategray","dodgerblue","darkorange1")) +  #scale_y_continuous(expand = c(0,0)) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  theme(legend.title=element_blank(),legend.key.size = unit(0.5, 'cm'),legend.text = element_text(size=6),
        legend.position="top",legend.justification="right",legend.margin=margin(0,0,0,0),#legend.box.margin=margin(-20,-100,-10,-10),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10), axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  ylab("Gene methylation % - 11-12 hrs")  + xlab(NULL)   #xlab("Gene expression TPM (Log2)") + 

# tiff("ExpMethyl_cor_12hrs.tiff", width = 2.5, height = 3.5, units = "in",res = 300,compression = "lzw")
# ExpMethyl_cor_12hrs
# dev.off()
suppressWarnings(grid.arrange(ExpMethyl_cor_12hrs))

Gene expression at 20 and 24-60 hrs embryos based on methylation from 24-60 hrs embryos - Fig 5I, right


ExpMethyl_cor_24hrs <- embryo_Exp_Methyl_data %>% select(-c(`CpG_N_9hrs`:`CpG_N_24hrs`,`methylation_9hrs`,`methylation_12hrs`)) %>% 
  pivot_longer(cols = -c(gene_id,methylation_24hrs),names_to = "exp_stage",values_to="exp_count") %>% 
  mutate(exp_stage=factor(exp_stage,levels = c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs"))) %>%
  mutate(exp_cat= ifelse(exp_count == 0, "0",
                         ifelse(between(exp_count,0,1),"≤ 1",
                                ifelse(between(exp_count,1,5),"≤ 5","≥ 5")))) %>% 
  filter(exp_stage %in% c("24hrs","24-60hrs")) %>%
  mutate(exp_cat=factor(exp_cat,levels = c("0","≤ 1","≤ 5","≥ 5"))) %>%
  ggplot(aes(fill=exp_stage,x=exp_cat,y=methylation_24hrs)) + geom_boxplot(outlier.colour = "grey",outlier.size = 0.4,alpha=0.6)+ 
  ylim(NA,20) + # to print in notebook
  #scale_y_break(c(20, 95), scales = 0.01, ticklabels=c(95, 100)) + # to save figure
  scale_fill_manual(values=c("forestgreen","gainsboro","darkorange1")) +  #scale_y_continuous(expand = c(0,0)) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  theme(legend.title=element_blank(),legend.key.size = unit(0.5, 'cm'),legend.text = element_text(size=6),
        legend.position="top",legend.justification="right",legend.margin=margin(0,0,0,0),#legend.box.margin=margin(-20,-100,-10,-10),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10), axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  ylab("Gene methylation % - 24-60 hrs") + xlab(NULL)

# tiff("ExpMethyl_cor_24hrs.tiff", width = 2.5, height = 3.5, units = "in",res = 300,compression = "lzw")
# ExpMethyl_cor_24hrs
# dev.off()
suppressWarnings(grid.arrange(ExpMethyl_cor_24hrs))

9.1 Methylation for different developmental gene categories - Fig 5J

11-12 hrs methylation

# setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
gene_categories <- read.table("/drives/ssd1/manuel/phaw/2022_analysis/mzt_analysis/dev_genes_categories.csv",header = T,stringsAsFactors = F,sep=",")

my_comparisons = list(c("Minor wave all","Minor wave zygotic"), c("Maternal","Minor wave all"),
                      c("Major wave all","Major wave zygotic"),c("Maternal","Minor wave zygotic"))#,c("Maternal","Major wave zygotic"))

ZGA_methylation_12h  <- geneBody_full_methData %>% #replace(is.na(.), 0) %>%
  mutate(Class = ifelse(gene %in% gene_categories$pz_minor_wave,"Minor wave zygotic",
                      ifelse(gene %in% gene_categories$minor_wave_allGenes,"Minor wave all",
                               ifelse(gene %in% gene_categories$pz_major_wave,"Major wave zygotic", 
                                      ifelse(gene %in% gene_categories$maternal_genes_noZGA,"Maternal",
                                             ifelse(gene %in% gene_categories$major_wave_allGenes,"Major wave all","rest")))))) %>% 
  mutate(Class = factor(Class,levels = c("Maternal","Minor wave all","Minor wave zygotic","Major wave all","Major wave zygotic"))) %>%
  filter(Class != "rest",class== "Cov12hrs") %>% #group_by(Class) %>% summarise(meanss=mean(tx_ln))
  ggplot(aes(x=Class, y=methylation,fill=Class),size=0.3) + geom_jitter(size=0.3, alpha=0.3,color="grey") + geom_violin(alpha=0.5) + theme_minimal() +
  ylim(NA,30)+
  #scale_y_break(c(30, 60), scales = 0.1, ticklabels=c(60, 100)) +# scale_y_continuous(expand = c(0, 50)) +
  xlab(NULL) + ylab("Gene methylation - 11-12 hrs") +
  stat_summary(fun=median, geom="point", shape=23, size=1.5, color="black", fill="red",alpha=1)+
  scale_fill_manual(name=" ", values=c("navyblue", "#2B83BA", "orange","firebrick4","seagreen4")) +
  theme(axis.text.x = element_text(angle = 20, vjust = 0.9, hjust=0.75,size=10),
        axis.text.y = element_text(size=8), axis.title.x=element_blank(),legend.position="none",
        axis.title.y = element_text(size = 14),
        axis.text.y.right = element_blank(),
        axis.line.y.right = element_blank(),
        axis.ticks.y.right = element_blank())# + stat_compare_means(comparisons = my_comparisons,label = "p.signif",size=2.5,vjust = 0)# + stat_compare_means(label.y = 8, label.x=1.5, size=5) 
#ZGA_methylation_12h

# tiff("ZGA_methylation_12h.tiff", width = 3, height = 3.5, units = "in",res = 800,compression = "lzw")
# ZGA_methylation_12h
# dev.off()
suppressWarnings(grid.arrange(ZGA_methylation_12h))

24-60 hrs methylation

ZGA_methylation_24_60h  <- geneBody_full_methData %>% #replace(is.na(.), 0) %>%
  mutate(Class = ifelse(gene %in% gene_categories$pz_minor_wave,"Minor wave zygotic",
                      ifelse(gene %in% gene_categories$minor_wave_allGenes,"Minor wave all",
                               ifelse(gene %in% gene_categories$pz_major_wave,"Major wave zygotic", 
                                      ifelse(gene %in% gene_categories$maternal_genes_noZGA,"Maternal",
                                             ifelse(gene %in% gene_categories$major_wave_allGenes,"Major wave all","rest")))))) %>% 
  mutate(Class = factor(Class,levels = c("Maternal","Minor wave all","Minor wave zygotic","Major wave all","Major wave zygotic"))) %>%
  filter(Class != "rest",class== "Cov24hrs") %>% #group_by(Class) %>% summarise(meanss=mean(tx_ln))
  ggplot(aes(x=Class, y=methylation,fill=Class),size=0.3) + geom_jitter(size=0.3, alpha=0.3,color="grey") + geom_violin(alpha=0.5) + theme_minimal() +
  ylim(NA,30)+
  #scale_y_break(c(30, 60), scales = 0.1, ticklabels=c(60, 100)) +# scale_y_continuous(expand = c(0, 50)) +
  xlab(NULL) + ylab("Gene methylation - 24-60 hrs") +
  stat_summary(fun=median, geom="point", shape=23, size=1.5, color="black", fill="red",alpha=1)+
  scale_fill_manual(name=" ", values=c("navyblue", "#2B83BA", "orange","firebrick4","seagreen4")) +
  theme(axis.text.x = element_text(angle = 20, vjust = 0.9, hjust=0.75,size=10),
        axis.text.y = element_text(size=8), axis.title.x=element_blank(),legend.position="none",
        axis.title.y = element_text(size = 14),
        axis.text.y.right = element_blank(),
        axis.line.y.right = element_blank(),
        axis.ticks.y.right = element_blank())# + stat_compare_means(comparisons = my_comparisons,label = "p.signif",size=2.5,vjust = 0)# + stat_compare_means(label.y = 8, label.x=1.5, size=5) 

suppressWarnings(grid.arrange(ZGA_methylation_24_60h))

---
title: "3_9 Whole-genome DNA methylation dynamics during *Parhyale* embryogenesis"
author: "Manuel Jara-Espejo"
output:
  html_notebook:
    number_sections: true
    code_folding: hide
---

# Introduction
This notebook describes the analysis and interpretation of Methyl-Seq data generated for three developmental timepoints:
Stage1 (S1): 0-9.15 hrs
Stage2 (S1): 11-12 hrs
Stage3 (S1): 24-60 hrs

# Methylation data processing 

## Quality check using *FastQC*

```{bash}
methyl_seq_raw=Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz # Example: Stage 1, Reads_1
echo "Stage1 raw reads_1:" $methyl_seq_raw
#./software/FastQC/fastqc $methyl_seq_raw Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz 
```

## Reads trimming using *TrimGalore*

```{bash}
#Running on stage1 data
~/software/TrimGalore-0.6.6/trim_galore --paired  Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1.fq.gz Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2.fq.gz

echo "Stage1 clean reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1.fq Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2_val_2.fq"
```

## Mapping reads to *Phaw5.1* genome

Running bismark_genome_preparation

```{bash}
$phaw_genome=/drives/ssd1/wei/genome/phaw_sambaAsm.scaff_seqs_editedScafNames.fa
~/software/Bismark-0.22.3/bismark_genome_preparation --path_to_bowtie ~/software/bowtie2/bowtie2-2.4.2-sra-linux-x86_64/  
--verbose --bowtie2 $phaw_genome
```

Running bismark_mapping

```{bash}
~/software/Bismark-0.22.3/bismark --bowtie2 --path_to_bowtie2 ~/software/bowtie2/bowtie2-2.4.2-sra-linux-x86_64/ 
--samtools_path ~/software/samtools-1.17 --genome /drives/ssd1/wei/genome -N 1 -L 22 --score_min L,0,-0.6  -1 ./Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1.fq -2 ./Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_2_val_2.fq

echo "Stage1 mapped reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.bam"
```

## Deduplication

```{bash}
~/software/Bismark-0.22.3/deduplicate_bismark -p --bam Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.bam
echo "Stage1 deduplicated reads: Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.deduplicated.bam"
```

## Methylation calling

```{bash}
#~/software/Bismark-0.22.3/bismark_methylation_extractor -p --comprehensive --samtools_path ~/software/samtools-1.17 --bedGraph --scaffolds  --cytosine_report --genome_folder /drives/ssd1/wei/genome/ Stage1_2_EKDL200012840-1a_HHHFMDSXY_L2_1_val_1_bismark_bt2_pe.deduplicated.bam
echo "Covered CpGs - Stage1"
head -5 /drives/ssd1/wei/emseq/map/nocut/S1/S1_CpGsum
```

# Summarise methylation level per stage

## CpG coverage for all CPGs in the genome across stages - Fig S13A

```{r}
knitr::opts_chunk$set(message=F,warning=F,echo=T)
#Load libraries
library(tidyr)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(ggpubr)
library(colortools)
library(Rsubread)
library(tibble)
library(scales)
library(ggbreak) 
library(VennDiagram)
library("RColorBrewer")
library(gridExtra)
library(methylKit)
```


```{r}
#Coverage for all CPGs in the genome
covAllCpG_9hr <- read.table("/drives/ssd1/wei/formanuel/S1_CpGsum",header = F) #Covered CpGs stage 0-9:15 hrs
covAllCpG_12hr <- read.table("/drives/ssd1/wei/formanuel/S2_CpGsum",header = F) #Covered CpGs stage 11:12 hrs
covAllCpG_24_60hr <- read.table("/drives/ssd1/wei/formanuel/S3_CpGsum",header = F) #Covered CpGs stage 24:60 hrs

#Merge coverage data
covAllCpG_allStages <- dplyr::bind_rows(list(covAllCpG_9hr,covAllCpG_12hr,covAllCpG_24_60hr), .id = 'stage') %>% 
  mutate(stage = recode(stage, '1' = 'emb_9hr', '2' = 'emb_12hr', '3' = 'emb_24_60hr')) %>%
  mutate(coverage = (V4 + V5))
colnames(covAllCpG_allStages) <- c("stage","Scaffold","pos","strand","mCpG_reads","UnmCpG_reads","methylation_level","coverage")

rm(covAllCpG_9hr,covAllCpG_12hr,covAllCpG_24_60hr)
```


```{r}
covAllCpG_allStages %>% group_by(stage) %>% summarise(covMedian = median(coverage),covMean = mean(coverage))

covAllCpG_allStages_distribution <- covAllCpG_allStages  %>% mutate(stage=factor(stage, levels = c('emb_9hr','emb_12hr','emb_24_60hr'))) %>%
  filter(coverage <= 50) %>%
  mutate(coverage=factor(coverage, levels = c(seq(0,50,1)))) %>%
  ggplot(aes(x=coverage,fill=stage)) +  geom_bar(stat='count', position='dodge') +
  scale_x_discrete(labels=c(seq(0,50,10)),breaks =seq(0,50,10) ) + 
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) +
  scale_y_continuous(expand = c(0,0),labels=unit_format(unit = "M",scale=1e-6))+ ylab("Counts (million)") +xlab("WGBS-seq coverage") +
  #scale_x_discrete(labels= c("0","20","40","60","80","100")) +
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =0,vjust = 0.65),
        axis.text.y = element_text(size = 13),legend.position=c(0.8,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

#tiff("covAllCpG_allStages_distribution.tiff", width = 5, height = 4, units = "in",res = 400,compression = "lzw")
covAllCpG_allStages_distribution 
#dev.off()
```
## Time-point specific fractional methylation levels -  Fig 5A

```{r}
# Plot average methylation for all covered CpGa (counts >= 5) 
meth_allCovCpG_allStages_perStage <- covAllCpG_allStages %>% filter(coverage >= 5) %>% 
  mutate(stage=factor(stage, levels = c('emb_9hr','emb_12hr','emb_24_60hr'))) %>% 
  ggplot(aes(x = stage, y = methylation_level,fill= stage)) + geom_bar( position='dodge',stat = "summary", fun = "mean") +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  xlab(NULL) + ylab("Average methylation %") + scale_y_continuous(expand = c(0,0)) 
  #scale_y_break(c(10, 90), scales = 0.03, ticklabels=c(90, 100))

#tiff("methyl_level_perStage_allCovCpG.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
meth_allCovCpG_allStages_perStage
#dev.off()
```

## Methylation level distribution for mCpGs (methylation >= 20) -  Fig S13B

```{r}
mCpg_methylDistribution <- covAllCpG_allStages %>% filter(coverage >= 5, methylation_level >= 20) %>% 
  mutate(stage=factor(stage, levels = c('emb_9hr','emb_12hr','emb_24_60hr'))) %>% 
  ggplot() + 
  geom_histogram(aes(x=methylation_level,y=..count..)) + facet_wrap(~stage) +
  scale_y_continuous(expand = c(0,0),labels=unit_format(unit = "M",scale=1e-6),limits = c(0,1400000))+ 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =30,vjust = 0.65),
        axis.text.y = element_text(size = 13),legend.position=c(0.1,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

#tiff("mCpg_methylDistribution.tiff", width = 10, height = 4, units = "in",res = 400,compression = "lzw")
mCpg_methylDistribution 
#dev.off()
```
## Methylation by genomic feature - Fig 5B and S13D

```{r}
#Open with summarised methylation data per stage and feature
perFeature_MethStats <- read.table("../wgbs_analysis/wholegenome_te",header = T)
colnames(perFeature_MethStats) <- c("region","5XCov_CpGs","mCpGs","mCpGs_percent","methylation_level","stage","Group")
perFeature_MethStats
```

```{r}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
#Plot fraction of methylated CpGs per feature - Fig 13D
mCpG_percent_byFeature <- perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24_60 hrs')) %>%
  ggplot(aes(x=region , y = mCpGs_percent, fill=stage)) +
  geom_col(position = "dodge",colour = "black", size = 0.1) +
  facet_wrap(~ Group,scales="free_x") +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + scale_y_continuous(expand = c(0,0)) +
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.1,0.85),legend.background = element_rect(fill='transparent'),
        axis.title = element_text(size = 14),#strip.text.x = element_text(size = 12),
        strip.background = element_blank(),
        strip.text.x = element_blank()) + xlab(NULL) + ylab("Percent of mCpG") 

#Plot fractional methylation per feature - Fig 5B 
mCpG_level_byFeature <- perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24_60 hrs')) %>%
  ggplot(aes(x=region , y = methylation_level, fill=stage)) +
  geom_col(position = "dodge",colour = "black", size = 0.1) +
  facet_wrap(~ Group,scales="free_x") +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.1,0.85),legend.background = element_rect(fill='transparent'),
        axis.title = element_text(size = 14),  strip.background = element_blank(),
        strip.text.x = element_blank()) + xlab(NULL) + ylab("Methylation %") 

#tiff("mCpG_percent_byFeature.tiff", width = 5, height = 3, units = "in",res = 400,compression = "lzw")
mCpG_percent_byFeature #%>% filter(stage =="emb_9hr") %>% filter(methylation_level == "100" ) %>% dim()
#dev.off()

#tiff("mCpG_level_byFeature.tiff", width = 5, height = 3, units = "in",res = 400,compression = "lzw")
mCpG_level_byFeature #%>% filter(stage =="emb_9hr") %>% filter(methylation_level == "100" ) %>% dim()
#dev.off()
```
## Distribution of covered CpGs across genomic features - Fig S13C

```{r}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
perGeneFeat_MethStats_allStages <- perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24-60 hrs')) %>%
  filter(Group == "Gene",region != "Genebody") %>% tibble() %>% 
  ggplot(aes(x = stage, y = `5XCov_CpGs`,fill= region)) + geom_bar(stat='identity', position='fill') +
  scale_y_continuous(expand = c(0,0))+ ylab("Covered CpGs fraction") + xlab(NULL) + scale_fill_brewer(palette = "Set1")+
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =30,vjust = 0.65),
        axis.text.y = element_text(size = 13),#legend.position=c(0.1,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

perTEs_MethStats_allStages <-perFeature_MethStats %>% mutate(stage = recode(stage, 'S1' = '9 hrs', 'S2' = '11-12 hrs', 'S3' = '24-60 hrs')) %>%
  filter(Group == "TE",region != "TE") %>% tibble() %>% 
  ggplot(aes(x = stage, y = `5XCov_CpGs`,fill= region)) + geom_bar(stat='identity', position='fill') +
  scale_y_continuous(expand = c(0,0))+ ylab("Covered CpGs fraction") + xlab(NULL) + scale_fill_brewer(palette = "Dark2")+
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 13,angle =30,vjust = 0.55),
        axis.text.y = element_text(size = 13),#legend.position=c(0.1,0.85),
        axis.title = element_text(size = 20),strip.text.x = element_text(size = 12)) 

#tiff("MethStats_allStages.tiff", width = 6.5, height = 5, units = "in",res = 400,compression = "lzw")
ggarrange(perGeneFeat_MethStats_allStages,perTEs_MethStats_allStages,nrow = 1,ncol = 2)

#dev.off()
```

# Dynamics of DNA Methylomes for Early Embryos
I aimed to identify differential methylated CpGs between stages and next interrogate if this changes are prevalent in specific genomic regions. Only included in the analysis CpGs with coverage >=5

## Read the files to a methylRawList object: myobj

```{r}
methylseq_dir <- "/drives/ssd1/wei/emseq/sum"
methylseq_files <- grep("CpG_report.txt_CG_methykit.txt2$",list.files(methylseq_dir,full.names = T),value=TRUE)

myobj=methRead(as.list(methylseq_files[3:6]),
           sample.id=list("s11h","s11h","s24h","s24h"),
           assembly="phaw51",
           treatment=c(0,0,1,1),
           context="CpG",mincov = 5)
```


```{r}
head(myobj[[4]])
```
Quality check

```{r fig.width=8, fig.height=8}
getMethylationStats(myobj[[1]],plot=TRUE,both.strands=FALSE) #9hrs
getMethylationStats(myobj[[2]],plot=TRUE,both.strands=FALSE) #9hrs
getMethylationStats(myobj[[3]],plot=TRUE,both.strands=FALSE) #11-12hrs
getMethylationStats(myobj[[4]],plot=TRUE,both.strands=FALSE) #11-12hrs
```
Merging samples into a single table
 
```{r}
meth=methylKit::unite(myobj, destrand=FALSE)
```

```{r}
clusterSamples(meth, dist="correlation", method="ward", plot=TRUE)
```
```{r}
pc=PCASamples(meth,obj.return = TRUE, adj.lim=c(1,1))
```


## Filtering CpGs based on variation

Calculate per CpG variation

```{r}
pm=percMethylation(meth) # get percent methylation matrix
mds=matrixStats::rowSds(pm) # calculate standard deviation of CpGs
head(meth[mds>5,])
```
Histogram of CpG variation

```{r}
hist(mds,col="gainsboro",xlab="Std. dev. per CpG",breaks = 100)
```

Keep CpGs with standard deviations larger than 2% - Cluster samples

```{r}
meth <- meth[mds>2,]
print(paste0("# of CpGs forn downstream analysis: ",nrow(meth)))
```

## Extracting differential methylated CpGs. 9hrs to 11 hrs transtion
Samples per conditions were pulled and Fisher's exact test was applied.

```{r}
getSampleID(meth)
pooled.meth=pool(meth,sample.ids=c("s11h","s24h"))
dm.pooledf=calculateDiffMeth(pooled.meth)
```

Get differentially methylated bases/regions 

```{r}
# All sites
all.diff=getMethylDiff(dm.pooledf,difference=5,qvalue=0.05,type="all")

# get hyper-methylated
hyper=getMethylDiff(dm.pooledf,difference=5,qvalue=0.05,type="hyper")

# get hypo-methylated
hypo=getMethylDiff(dm.pooledf,difference=5,qvalue=0.05,type="hypo")
```

## Differentially methylated Regions

Tile the genome with windows of 1000bp length and 1000 bp step-size 

# Gene body (+-2kb) methylation analysis - Fig 5C

Process and summarise methylation per window across genes
Each gene was split into 100 windows, CpGs were mapped to each window and the average methylation per window was estimated. The same process was performed on 2kb upstream and downstream regions, which were split into 20 windows

Summarising gene body methylation per stage

```{r}
tiles=tileMethylCounts(myobj,win.size=1000,step.size=1000)
#head(tiles[[1]],3)
```

Combine into single object

```{r}
meth_byRegion=methylKit::unite(tiles, destrand=FALSE)
```

Extracting differential methylated regions. 11hrs to 24hrs transtion
Samples per conditions were pulled and Fisher's exact test was applied.

```{r}
getSampleID(meth_byRegion)
pooled.methRegion=pool(meth_byRegion,sample.ids=c("s11h","s9h"))
dm.pooledf_byRegion=calculateDiffMeth(pooled.methRegion)
```

```{r}
# All sites
all.diff=getMethylDiff(dm.pooledf_byRegion,difference=5,qvalue=0.05,type="all")
print(paste0("All diffMeth:",nrow(all.diff)))

# get hyper-methylated
hyper=getMethylDiff(dm.pooledf_byRegion,difference=5,qvalue=0.05,type="hyper")
print(paste0("All diffMeth:",nrow(hyper)))

# get hypo-methylated
hypo=getMethylDiff(dm.pooledf_byRegion,difference=5,qvalue=0.05,type="hypo")
print(paste0("All diffMeth:",nrow(hypo)))

```

Intersect with genic and TE elements

```{r}
#Call window-based methylation files per stage
s1_genebody_w100 <- read.table("/drives/ssd1/wei/genome/genebody/nostrand/S1_genebody_win100_5_sum", header = F, stringsAsFactors = F)
colnames(s1_genebody_w100) <- c("gene_id","window","cpg_count","methylation")
s2_genebody_w100 <- read.table("/drives/ssd1/wei/genome/genebody/nostrand/S2_genebody_win100_5_sum", header = F, stringsAsFactors = F)
colnames(s2_genebody_w100) <- c("gene_id","window","cpg_count","methylation")
s3_genebody_w100 <- read.table("/drives/ssd1/wei/genome/genebody/nostrand/S3_genebody_win100_5_sum", header = F, stringsAsFactors = F)
colnames(s3_genebody_w100) <- c("gene_id","window","cpg_count","methylation")

# Summarise files, estimating window-based average methylation across genes
s1_genebody_w100 <- s1_genebody_w100 %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s1_genebody_w100 <- s1_genebody_w100[,c("gene_id",paste0("w",seq(1:100)))]

s2_genebody_w100 <- s2_genebody_w100  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s2_genebody_w100 <- s2_genebody_w100[,c("gene_id",paste0("w",seq(1:100)))]

s3_genebody_w100 <- s3_genebody_w100  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s3_genebody_w100 <- s3_genebody_w100[,c("gene_id",paste0("w",seq(1:100)))]

# Final data
s1_genebody_w100 <- s1_genebody_w100 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s2_genebody_w100 <- s2_genebody_w100 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s3_genebody_w100 <- s3_genebody_w100 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
```


```{r}
s1_genebody_w100
s2_genebody_w100
s3_genebody_w100
```

Summarizing 2kb downstream gene region per stage

```{r}
#Call window-based methylation files per stage
s1_down_20 <- read.table("/drives/ssd1/wei/genome/down/nostrand/S1_down2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s1_down_20) <- c("gene_id","window","cpg_count","methylation")
s2_down_20 <- read.table("/drives/ssd1/wei/genome/down/nostrand/S2_down2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s2_down_20) <- c("gene_id","window","cpg_count","methylation")
s3_down_20 <- read.table("/drives/ssd1/wei/genome/down/nostrand/S3_down2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s3_down_20) <- c("gene_id","window","cpg_count","methylation")

# Summarise files, estimating window-based average methylation across genes
s1_down_20 <- s1_down_20 %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s1_down_20 <- s1_down_20[,c("gene_id",paste0("w",seq(1:20)))]

s2_down_20 <- s2_down_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s2_down_20 <- s2_down_20[,c("gene_id",paste0("w",seq(1:20)))]

s3_down_20 <- s3_down_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s3_down_20 <- s3_down_20[,c("gene_id",paste0("w",seq(1:20)))]

# Final data
s1_down_20 <- s1_down_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s2_down_20 <- s2_down_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s3_down_20 <- s3_down_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
```


```{r}
s1_down_20
s2_down_20
s3_down_20
```

Summarizing 2kb upstream gene region per stage

```{r}
#Call window-based methylation files per stage
s1_up_20 <- read.table("/drives/ssd1/wei/genome/up/nostrand/S1_up2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s1_up_20) <- c("gene_id","window","cpg_count","methylation")
s2_up_20 <- read.table("/drives/ssd1/wei/genome/up/nostrand/S2_up2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s2_up_20) <- c("gene_id","window","cpg_count","methylation")
s3_up_20 <- read.table("/drives/ssd1/wei/genome/up/nostrand/S3_up2kb_win20_5_sum", header = F, stringsAsFactors = F)
colnames(s3_up_20) <- c("gene_id","window","cpg_count","methylation")

# Summarise files, estimating window-based average methylation across genes
s1_up_20 <- s1_up_20 %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s1_up_20 <- s1_up_20[,c("gene_id",paste0("w",seq(1:20)))]

s2_up_20 <- s2_up_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s2_up_20 <- s2_up_20[,c("gene_id",paste0("w",seq(1:20)))]

s3_up_20 <- s3_up_20  %>%
  mutate(window=paste0("w",window)) %>% pivot_wider(- cpg_count,names_from = window,values_from = methylation) 
s3_up_20 <- s3_up_20[,c("gene_id",paste0("w",seq(1:20)))]

# Final data
s1_up_20 <- s1_up_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s2_up_20 <- s2_up_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
s3_up_20 <- s3_up_20 %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)
```


```{r}
s1_up_20
s2_up_20
s3_up_20
```

Merge upstream, gene_body and downstream average methylation data

```{r}
up_gene_down_meth <- rbind(
cbind(s1_up_20,s1_genebody_w100,s1_down_20),
cbind(s2_up_20,s2_genebody_w100,s2_down_20),
cbind(s3_up_20,s3_genebody_w100,s3_down_20)) #%>% mutate(stage= c("S1","S2","S3"))
colnames(up_gene_down_meth) <- paste0("w",seq(1:140))
up_gene_down_meth <- up_gene_down_meth %>% dplyr::mutate(stage= c("S1","S2","S3")) %>% pivot_longer(-stage)
up_gene_down_meth
```

Plot

```{r}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_upDown_meth_plot <-up_gene_down_meth %>% 
  mutate(stage=factor(stage, levels= c("S1","S2","S3")),
         name=factor(name,levels =paste0("w",seq(1:140)))) %>%
  ggplot(aes(y=value ,x=name, group=stage,color=stage)) + geom_line(size=0.8)  +
  xlab("") +  ylab("Methylation (%)") +
  ggtitle("") +
  scale_x_discrete(breaks = c('w8','w20','w70','w120','w135'), 
                     labels=c("Upstream","TSS","Gene body","TES","Downstream")) +
  scale_color_manual(values = c("orange2","mistyrose4","maroon"),
                     breaks = c("S1","S2","S3"),
                     labels = c("0-9 hrs","11-12 hrs","24-60 hrs")) + theme_bw() +
  theme(panel.grid.major = element_line(colour=NA),
        panel.background = element_rect(fill= "transparent", colour=NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        panel.grid.minor = element_blank(),
        legend.position = c(0.87,0.65),
        legend.title = element_blank(),
        legend.key.size = unit(15,"pt"),
        legend.text=element_text(size=8),
        axis.text.y = element_text(size = 10),axis.text.x = element_text(size = 10,vjust = 1.15,angle=30,hjus=1.2),
        axis.title.x = element_text(size = 14),axis.title.y = element_text(size = 14))

#tiff("geneBody_upDown_meth.tiff", width = 4, height = 2.5, units = "in",res = 400,compression = "lzw")
geneBody_upDown_meth_plot
#dev.off()
```

# Exon-intron methylation level - Fig 5D
Each exon was split in 30 windows, while each intron was split in 80 windows. For each windows, the average CpG methylation was estimated. The graph represent the average methylation per window across genes, using only the first four exons and introns.

Exon methylation 

```{r}
#Call window-based methylation files per stage
s1_exon_30 <- read.table("/drives/ssd1/wei/genome/exon/nostrand/S1_exon_modify_win30_5_sum", header = F, stringsAsFactors = F)
colnames(s1_exon_30) <- c("gene_id","exon_n","window","cpg_count","methylation")
s2_exon_30 <- read.table("/drives/ssd1/wei/genome/exon/nostrand/S2_exon_modify_win30_5_sum", header = F, stringsAsFactors = F)
colnames(s2_exon_30) <- c("gene_id","exon_n","window","cpg_count","methylation")
s3_exon_30 <- read.table("/drives/ssd1/wei/genome/exon/nostrand/S3_exon_modify_win30_5_sum", header = F, stringsAsFactors = F)
colnames(s3_exon_30) <- c("gene_id","exon_n","window","cpg_count","methylation")

```

```{r}
s1_exon_30 %>% head()
```


```{r}
#Summarised intron methylation - Stage1
exon_methy_list_s1 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s1_exon_30 %>% mutate(window=paste0("w",window)) %>% filter(exon_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-exon_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:30)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

exon_methy_list_s1[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
exon_methy_list_s2 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s2_exon_30 %>% mutate(window=paste0("w",window)) %>% filter(exon_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-exon_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:30)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

exon_methy_list_s2[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
exon_methy_list_s3 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s3_exon_30 %>% mutate(window=paste0("w",window)) %>% filter(exon_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-exon_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:30)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

exon_methy_list_s3[[i]] <- temp_df
}
```

```{r}
exon_methy_list_s1
```


Intron methylation 

```{r}
#Call window-based methylation files per stage
s1_intron_80 <- read.table("/drives/ssd1/wei/genome/intron/nostrand/S1_intron_modify_win80_5_sum", header = F, stringsAsFactors = F)
colnames(s1_intron_80) <- c("gene_id","intron_n","window","cpg_count","methylation")
s2_intron_80 <- read.table("/drives/ssd1/wei/genome/intron/nostrand/S2_intron_modify_win80_5_sum", header = F, stringsAsFactors = F)
colnames(s2_intron_80) <- c("gene_id","intron_n","window","cpg_count","methylation")
s3_intron_80 <- read.table("/drives/ssd1/wei/genome/intron/nostrand/S3_intron_modify_win80_5_sum", header = F, stringsAsFactors = F)
colnames(s3_intron_80) <- c("gene_id","intron_n","window","cpg_count","methylation")

```

```{r}
s1_intron_80 %>% head()
```


```{r}
#Summarised intron methylation - Stage1
intron_methy_list_s1 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s1_intron_80 %>% mutate(window=paste0("w",window)) %>% filter(intron_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-intron_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:80)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

intron_methy_list_s1[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
intron_methy_list_s2 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s2_intron_80 %>% mutate(window=paste0("w",window)) %>% filter(intron_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-intron_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:80)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

intron_methy_list_s2[[i]] <- temp_df
}

#Summarised intron methylation - Stage2
intron_methy_list_s3 = list()
for (i in 1:4) { # Only extract data for first 4 introns
  temp_df <- s3_intron_80 %>% mutate(window=paste0("w",window)) %>% filter(intron_n == i) %>% 
    pivot_wider(- cpg_count,names_from = window,values_from = methylation) %>% select(-intron_n)
  temp_df <- temp_df[,c("gene_id",paste0("w",seq(1:80)))]
  temp_df <- temp_df %>% select(-gene_id) %>% summarise_all(mean,na.rm = TRUE)

intron_methy_list_s3[[i]] <- temp_df
}
```

```{r}
intron_methy_list_s1
```

Merge exon and intron  methylation data

```{r}
exon_intron_meth <- rbind(cbind(exon_methy_list_s1[[1]],intron_methy_list_s1[[1]],exon_methy_list_s1[[2]],intron_methy_list_s1[[2]],
                          exon_methy_list_s1[3],intron_methy_list_s1[[3]],exon_methy_list_s1[[4]],intron_methy_list_s1[[4]]),
                    cbind(exon_methy_list_s2[[1]],intron_methy_list_s2[[1]],exon_methy_list_s2[[2]],intron_methy_list_s2[[2]],
                          exon_methy_list_s2[3],intron_methy_list_s2[[3]],exon_methy_list_s2[[4]],intron_methy_list_s2[[4]]),
                    cbind(exon_methy_list_s3[[1]],intron_methy_list_s3[[1]],exon_methy_list_s3[[2]],intron_methy_list_s3[[2]],
                          exon_methy_list_s3[3],intron_methy_list_s3[[3]],exon_methy_list_s3[[4]],intron_methy_list_s3[[4]]))

colnames(exon_intron_meth) <- paste0("w",seq(1:440))
exon_intron_meth <- exon_intron_meth %>% dplyr::mutate(stage= c("S1","S2","S3")) %>% pivot_longer(-stage)
exon_intron_meth
```


```{r}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
exonIntron_meth_plot <- exon_intron_meth %>% 
  mutate(stage=factor(stage, levels= c("S1","S2","S3")),
         name=factor(name,levels =paste0("w",seq(1:440)))) %>%
  ggplot(aes(y=value ,x=name, group=stage,color=stage)) + geom_line(size=0.8)  +
  xlab("") +  ylab("Methylation (%)") + ggtitle(NULL) +
  scale_x_discrete(breaks = paste0('w',c(1,15,30,70,110,125,140,180,220,235,250,290,330,345,360,400,440)), 
                     labels=c(" ","Exon1"," ","Intron1"," ","Exon2"," ","Intron2"," ","Exon3"," ","Intron3"," ","Exon4"," ","Intron4"," ")) +
  scale_color_manual(values = c("orange2","mistyrose4","maroon"),
                     breaks = c("S1","S2","S3"),
                     labels = c("0-9 hrs","11-12 hrs","24-60 hrs")) + theme_bw() +
  theme(panel.grid.major = element_line(colour=NA),
        panel.background = element_rect(fill= "transparent", colour=NA),
        plot.background = element_rect(fill = "transparent",colour = NA),
        panel.grid.minor = element_blank(), legend.position = c(0.87,0.65), legend.title = element_blank(),
        legend.key.size = unit(12,"pt"), legend.text=element_text(size=7),
        axis.text.y = element_text(size = 10),axis.text.x = element_text(size = 8,vjust = 0.5),
        axis.title.x = element_text(size = 14),axis.title.y = element_text(size = 14))

#tiff("exonIntron_meth.tiff", width = 4, height = 2, units = "in",res = 400,compression = "lzw")
exonIntron_meth_plot
#dev.off()
```

# Analysis of covered Cpgs, %of mCpg and methylation level for whole gene length and first 2500 bp

Load data

```{r}
#CpGs covered along full gene body - Number and methylation average per gene
s1_covCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_covCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_covCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_covCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_covCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_covCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

#mCpGs along full gene body - Number and methylation average per gene

s1_mCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_mCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_mCpG_count <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_mCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_mCpG_count<-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_mCpG_count",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

#CpGs covered along 5' 2500 bp of each gene - Number and methylation average per gene
s1_covCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_covCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_covCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_covCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_covCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_covCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

#mCpGs along 5' 2500 bp of each gene - Number and methylation average per gene

s1_mCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s1_mCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s2_mCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s2_mCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')
s3_mCpG_count_first2500bp <-  read.table("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/s3_mCpG_count_first2500bp",header = F,sep= "\t", stringsAsFactors = F,na.strings = '.')

```

Merge and summarise whole gene body methylation data

```{r}
geneBody_full_methData <- dplyr::bind_rows(list(s1_covCpG_count,s2_covCpG_count,s3_covCpG_count,s1_mCpG_count,s2_mCpG_count,s3_mCpG_count), .id = 'class') %>%
  mutate(class = recode(class, '1' = 'Cov9hrs', '2' = 'Cov12hrs', '3' = 'Cov24hrs',
                        '4' = 'mCpG9hrs', '5' = 'mCpG12hrs', '6' = 'mCpG24hrs'))
colnames(geneBody_full_methData) <- c("class","gene","group","strand","CpG_N","methylation")
geneBody_full_methData %>% head()
```

Estimate percentage of gene body methylated CpGs (mCpG) - Fig 5E left

```{r  fig1, fig.height = 3, fig.width = 1.5}
knitr::opts_chunk$set(message=F,warning=F,echo=T)
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_mcpg_perc <- geneBody_full_methData %>% select(-c(group,strand)) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  mutate(mCpG_perc9hr= (CpG_N_mCpG9hrs/CpG_N_Cov9hrs)*100,
         mCpG_perc12hr= (CpG_N_mCpG12hrs/CpG_N_Cov12hrs)*100,
         mCpG_perc24hr= (CpG_N_mCpG24hrs/CpG_N_Cov24hrs)*100) %>% select(gene,mCpG_perc9hr,mCpG_perc12hr,mCpG_perc24hr) %>%
  melt(id.vars="gene",value.name = "mCpG_perc") %>% 
  ggplot(aes(x=variable,y=mCpG_perc,fill=variable)) + geom_boxplot(outlier.colour = "grey",size=0.4) +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  scale_y_break(c(40, 95), scales = 0.1, ticklabels=c(95, 100)) + ylab("Gene body mCpG %") +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank(),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12)) + xlab(NULL) + ylab(NULL)

suppressWarnings(grid.arrange(geneBody_mcpg_perc))

#tiff("geneBody_mcpg_perc.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody_mcpg_per
#dev.off()
```

Estimate gene body methylation level - Fig 5F left

```{r  fig2, fig.height = 3, fig.width = 1.5}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

geneBody_methLevel_all <- geneBody_full_methData %>% select(-c(group,strand)) %>% #sample_n(6000) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  select(gene,methylation_Cov9hrs:methylation_Cov24hrs) %>%
  melt(id.vars="gene",value.name = "Methylation") %>% #tibble() %>% 
  ggplot(aes(x=variable,y=as.numeric(Methylation),fill=variable)) + geom_boxplot() +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +  
  scale_y_break(c(30, 95), scales = 0.1, ticklabels=c(95, 100)) + #scale_x_break(c(40,90), scales=2) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + xlab(NULL) + ylab("Gene body methylation %")

suppressWarnings(grid.arrange(geneBody_methLevel_all))
#tiff("geneBody_methLevel_all.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody_methLevel_all
#dev.off()
```


Merge and summarise gene body (first 2500 bp) methylation data
 
```{r}
geneBody_up2500_methData <- dplyr::bind_rows(list(s1_covCpG_count_first2500bp,s2_covCpG_count_first2500bp,s3_covCpG_count_first2500bp,
                                                  s1_mCpG_count_first2500bp,s2_mCpG_count_first2500bp,s3_mCpG_count_first2500bp), .id = 'class') %>% 
  mutate(class = recode(class, '1' = 'Cov9hrs', '2' = 'Cov12hrs', '3' = 'Cov24hrs',
                        '4' = 'mCpG9hrs', '5' = 'mCpG12hrs', '6' = 'mCpG24hrs'))
colnames(geneBody_up2500_methData) <- c("class","gene","group","strand","CpG_N","methylation")
geneBody_up2500_methData %>% head()
```

Estimate percentage of gene body (first 2500 bp) methylated CpGs (mCpG) - Fig 5E right

```{r  fig3, fig.height = 3, fig.width = 1.5}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

geneBody2500_mcpg_perc <- geneBody_up2500_methData %>%  select(-c(group,strand)) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  mutate(mCpG_perc9hr= (CpG_N_mCpG9hrs/CpG_N_Cov9hrs)*100,
         mCpG_perc12hr= (CpG_N_mCpG12hrs/CpG_N_Cov12hrs)*100,
         mCpG_perc24hr= (CpG_N_mCpG24hrs/CpG_N_Cov24hrs)*100) %>% select(gene,mCpG_perc9hr,mCpG_perc12hr,mCpG_perc24hr) %>%
  melt(id.vars="gene",value.name = "mCpG_perc") %>% 
  ggplot(aes(x=variable,y=mCpG_perc,fill=variable)) + geom_boxplot(outlier.colour = "grey",size=0.4) +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  scale_y_break(c(40, 95), scales = 0.1, ticklabels=c(95, 100)) + #scale_x_break(c(40,90), scales=2) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank(),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12)) + xlab(NULL) + ylab(NULL)

suppressWarnings(grid.arrange(geneBody2500_mcpg_perc))

#tiff("geneBody2500_mcpg_perc.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody2500_mcpg_perc
#dev.off()
```

Estimate gene body (first 2500 bp) methylation level - Fig 5F right

```{r  fig4, fig.height = 3, fig.width = 1.5}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

geneBody2500_methLevel_all <- geneBody_up2500_methData %>% select(-c(group,strand)) %>% #sample_n(6000) %>%
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)) %>% 
  select(gene,methylation_Cov9hrs:methylation_Cov24hrs) %>%
  melt(id.vars="gene",value.name = "Methylation") %>% #tibble() %>% 
  ggplot(aes(x=variable,y=as.numeric(Methylation),fill=variable)) + geom_boxplot() +
  scale_x_discrete(labels= c("9 hrs","11-12 hrs","24-60 hrs")) +  
  scale_y_break(c(30, 95), scales = 0.1, ticklabels=c(95, 100)) + #scale_x_break(c(40,90), scales=2) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon")) + 
  theme(legend.title=element_blank(),axis.text.x = element_text(size = 10,angle =30,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position="none",
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + xlab(NULL) +  ylab("Gene body methylation %")

suppressWarnings(grid.arrange(geneBody2500_methLevel_all))

#tiff("geneBody2500_methLevel_all.tiff", width = 1.5, height = 3, units = "in",res = 400,compression = "lzw")
#geneBody2500_methLevel_all
#dev.off()
```

Covered CpGs per gene - Fig S13E

```{r}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_Cov_N_all_density <- geneBody_full_methData %>% select(-c(group,strand)) %>% 
  filter(grepl('Cov',class)) %>% mutate(class=gsub("Cov","",class)) %>% mutate(class= factor(class,levels = c("9hrs","12hrs","24hrs"))) %>%
  ggplot(aes(x=as.numeric(CpG_N),fill= `class`)) + geom_density(alpha=0.5) + xlim(NA,500) + scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon"),labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  geom_vline(aes(xintercept = 10), color="chocolate4", linetype="dashed", size=0.5) + theme_bw() +
  theme(legend.title=element_blank(),legend.background = element_rect(fill='transparent'),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.85,0.8),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + ylab("Density") + xlab("Covered CpGs per gene")

#tiff("geneBody_Cov_N_all_density.tiff", width = 3.5, height = 3, units = "in",res = 400,compression = "lzw")
geneBody_Cov_N_all_density
#dev.off()
```


Pattern of gene methylation per stage - Fig 5G

```{r}
#setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
geneBody_methLevel_all_density <- geneBody_full_methData %>% select(-c(group,strand)) %>% 
  filter(grepl('Cov',class)) %>% mutate(class=gsub("Cov","",class)) %>% mutate(class= factor(class,levels = c("9hrs","12hrs","24hrs"))) %>%
  ggplot(aes(x=as.numeric(methylation),fill= `class`)) + geom_density(alpha=0.5) +
  scale_x_continuous(trans = log10_trans(),
                     breaks = c(0,0.01,0.1,1,3,10,100),
                     labels = label_number(accuracy = 1)) + scale_y_continuous(expand = c(0,0)) +
  scale_fill_manual(values=c("orange2","mistyrose4","maroon"),labels= c("9 hrs","11-12 hrs","24-60 hrs")) +
  geom_vline(aes(xintercept = 3), color="chocolate4", linetype="dashed", size=0.5) + theme_bw() +
  theme(legend.title=element_blank(),legend.background = element_rect(fill='transparent'),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10),legend.position=c(0.15,0.8),
        axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),
        axis.ticks.x.top = element_blank(),
        axis.line.x.top = element_blank()) + ylab("Density") + xlab("Gene methylation %")


#tiff("geneBody_methLevel_all_density.tiff", width = 3.5, height = 3, units = "in",res = 400,compression = "lzw")
geneBody_methLevel_all_density
#dev.off()

```
# Identify and quantify methylated genes per stage  - Fig 5H

```{r}
setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")

methylatedGenes_9hrs <- geneBody_full_methData %>% filter(class == "Cov9hrs" & CpG_N >= 5 & methylation >= 3) %>% pull(gene) 
methylatedGenes_12hrs <- geneBody_full_methData %>% filter(class == "Cov12hrs" & CpG_N >= 5 & methylation >= 3) %>% pull(gene)
methylatedGenes_24hrs <- geneBody_full_methData %>% filter(class == "Cov24hrs" & CpG_N >= 5 & methylation >= 3) %>% pull(gene)

Reduce(intersect,list(methylatedGenes_9hrs,methylatedGenes_12hrs,methylatedGenes_24hrs)) %>% unique() %>% length()

#Plot venn diagram intersecting methylated genes among stages

venn.diagram(
  x = list(methylatedGenes_9hrs,methylatedGenes_12hrs,methylatedGenes_24hrs),
  category.names = c("9 hrs" , "11-12 hrs","24-60 hrs"),
  filename = 'methylatedGenes_overlapp.tiff',
  height = 4, width = 4, resolution = 300, imagetype = "tiff", 
  units = "in", compression = "lzw", fill= c("orange2","mistyrose4","maroon" ),
  output=T, scaled=T,print.mode=c("raw","percent"))
  
```

Visualise venn diagrame in the notebook

```{r fig5, fig.height = 3, fig.width = 3}
plt = venn.diagram(
  x = list(methylatedGenes_9hrs,methylatedGenes_12hrs,methylatedGenes_24hrs),
  category.names = c("9 hrs" , "11-12 hrs","24-60 hrs"),
  filename = NULL,
  height = 4, width = 4, resolution = 300, imagetype = "tiff", 
  units = "in", compression = "lzw", fill= c("orange2","mistyrose4","maroon" ),
  output=T, scaled=T,print.mode=c("raw","percent"))
grid.newpage()
grid::grid.draw(plt)
```
# Relationship between gene methylation and expression levels
For each methylome dataset (0-9,11-12 and 24-60 hrs) we analysed the expression levels in three subsequent embryonic stages. Expression was categorised as absent (TPM = 0), low(TPM <=1), moderate (TPM <=5) and high (TPM >=5)


Merge Methyl and expression data

```{r}
embryo_exonCounts_polyA_tpm <- read.table("/drives/ssd1/manuel/phaw/2022_analysis/mzt_analysis/embryo_exonCounts_polyA_tpm.txt",header = T,stringsAsFactors = F)
colnames(embryo_exonCounts_polyA_tpm) <- c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs")

embryo_Exp_Methyl_data <- merge(embryo_exonCounts_polyA_tpm,
geneBody_full_methData %>% select(-c(group,strand)) %>% 
  filter(grepl('Cov',class)) %>% mutate(class=gsub("Cov","",class)) %>% 
  mutate(class= factor(class,levels = c("9hrs","12hrs","24hrs"))) %>% 
  pivot_wider(names_from=class,values_from= c(CpG_N,methylation)),by.x = "gene_id",by.y = "gene")

embryo_Exp_Methyl_data
```

Gene expression at 11, 12 and 13-14 hrs embryos based on methylation from 0-9:15 hrs embryos - Fig 5I, left


```{r fig6, fig.height = 3, fig.width = 2.5}
get_my_colors <- colorRampPalette(brewer.pal(6, "Dark2"))
mycolors <- get_my_colors(8)

# Correlate 9hrs methylation with all expression timepoints - Boxplots
ExpMethyl_cor_9hrs <- embryo_Exp_Methyl_data %>% select(-c(`CpG_N_9hrs`:`CpG_N_24hrs`,`methylation_12hrs`,`methylation_24hrs`)) %>% 
  pivot_longer(cols = -c(gene_id,methylation_9hrs),names_to = "exp_stage",values_to="exp_count") %>% 
  mutate(exp_stage=factor(exp_stage,levels = c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs"))) %>%
  mutate(exp_cat= ifelse(exp_count == 0, "0",
                         ifelse(between(exp_count,0,1),"≤ 1",
                                ifelse(between(exp_count,1,5),"≤ 5","≥ 5")))) %>% 
  filter(exp_stage %in% c("11hrs","12hrs","14hrs")) %>%  # filter(gene_id %in% methylatedGenes_9hrs) %>%
  mutate(exp_cat=factor(exp_cat,levels = c("0","≤ 1","≤ 5","≥ 5"))) %>%
  ggplot(aes(fill=exp_stage,x=exp_cat,y=methylation_9hrs)) + geom_boxplot(outlier.colour = "grey",outlier.size = 0.4,alpha=0.6)+
  ylim(NA,20) + # to print in notebook
  #scale_y_break(c(20, 95), scales = 0.01, ticklabels=c(95, 100)) + # to save figure
  scale_fill_manual(values=c("firebrick3","darkslategray","dodgerblue")) + 
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  theme(legend.title=element_blank(),legend.key.size = unit(0.5, 'cm'),legend.text = element_text(size=6),
        legend.position="top",legend.justification="right",legend.margin=margin(0,0,0,0),#legend.box.margin=margin(-20,-100,-10,-10),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10), axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  ylab("Gene methylation % - 9hrs")  + xlab(NULL) 

#tiff("ExpMethyl_cor_9hrs.tiff", width = 2.5, height = 3.5, units = "in",res = 300,compression = "lzw")
#ExpMethyl_cor_9hrs
#dev.off()
suppressWarnings(grid.arrange(ExpMethyl_cor_9hrs))
```
Gene expression at 12, 14 and 16 hrs embryos based on methylation from 11-12 hrs embryos - Fig 5I, middle

```{r fig7, fig.height = 3, fig.width = 2.5}
ExpMethyl_cor_12hrs <- embryo_Exp_Methyl_data %>% select(-c(`CpG_N_9hrs`:`CpG_N_24hrs`,`methylation_9hrs`,`methylation_24hrs`)) %>% 
  pivot_longer(cols = -c(gene_id,methylation_12hrs),names_to = "exp_stage",values_to="exp_count") %>% 
  mutate(exp_stage=factor(exp_stage,levels = c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs"))) %>%
  mutate(exp_cat= ifelse(exp_count == 0, "0",
                         ifelse(between(exp_count,0,1),"≤ 1",
                                ifelse(between(exp_count,1,5),"≤ 5","≥ 5")))) %>% 
  filter(exp_stage %in% c("12hrs","14hrs","16hrs")) %>%
  mutate(exp_cat=factor(exp_cat,levels = c("0","≤ 1","≤ 5","≥ 5"))) %>%
  ggplot(aes(fill=exp_stage,x=exp_cat,y=methylation_12hrs)) + geom_boxplot(outlier.colour = "grey",outlier.size = 0.4,alpha=0.6)+
  ylim(NA,20) + # to print in notebook
  #scale_y_break(c(20, 95), scales = 0.01, ticklabels=c(95, 100)) + # to save figure
  scale_fill_manual(values=c("darkslategray","dodgerblue","darkorange1")) +  #scale_y_continuous(expand = c(0,0)) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  theme(legend.title=element_blank(),legend.key.size = unit(0.5, 'cm'),legend.text = element_text(size=6),
        legend.position="top",legend.justification="right",legend.margin=margin(0,0,0,0),#legend.box.margin=margin(-20,-100,-10,-10),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10), axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  ylab("Gene methylation % - 11-12 hrs")  + xlab(NULL)

# tiff("ExpMethyl_cor_12hrs.tiff", width = 2.5, height = 3.5, units = "in",res = 300,compression = "lzw")
# ExpMethyl_cor_12hrs
# dev.off()
suppressWarnings(grid.arrange(ExpMethyl_cor_12hrs))
```

Gene expression at 20 and 24-60 hrs embryos based on methylation from 24-60 hrs embryos - Fig 5I, right

```{r fig8, fig.height = 3, fig.width = 2.5}

ExpMethyl_cor_24hrs <- embryo_Exp_Methyl_data %>% select(-c(`CpG_N_9hrs`:`CpG_N_24hrs`,`methylation_9hrs`,`methylation_12hrs`)) %>% 
  pivot_longer(cols = -c(gene_id,methylation_24hrs),names_to = "exp_stage",values_to="exp_count") %>% 
  mutate(exp_stage=factor(exp_stage,levels = c("gene_id","0-9hrs","11hrs","12hrs","14hrs","16hrs","20hrs","24hrs","24-60hrs"))) %>%
  mutate(exp_cat= ifelse(exp_count == 0, "0",
                         ifelse(between(exp_count,0,1),"≤ 1",
                                ifelse(between(exp_count,1,5),"≤ 5","≥ 5")))) %>% 
  filter(exp_stage %in% c("24hrs","24-60hrs")) %>%
  mutate(exp_cat=factor(exp_cat,levels = c("0","≤ 1","≤ 5","≥ 5"))) %>%
  ggplot(aes(fill=exp_stage,x=exp_cat,y=methylation_24hrs)) + geom_boxplot(outlier.colour = "grey",outlier.size = 0.4,alpha=0.6)+ 
  ylim(NA,20) + # to print in notebook
  #scale_y_break(c(20, 95), scales = 0.01, ticklabels=c(95, 100)) + # to save figure
  scale_fill_manual(values=c("forestgreen","gainsboro","darkorange1")) +  #scale_y_continuous(expand = c(0,0)) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  geom_hline(aes(yintercept = 3), color="chocolate4", linetype="dashed", size=0.5) +
  theme(legend.title=element_blank(),legend.key.size = unit(0.5, 'cm'),legend.text = element_text(size=6),
        legend.position="top",legend.justification="right",legend.margin=margin(0,0,0,0),#legend.box.margin=margin(-20,-100,-10,-10),
        axis.text.x = element_text(size = 10,angle =0,vjust = 0.65,hjust = 0.7),
        axis.text.y = element_text(size = 10), axis.title = element_text(size = 14),strip.text.x = element_text(size = 12),
        axis.text.x.top = element_blank(),axis.ticks.x.top = element_blank(),axis.line.x.top = element_blank()) + 
  ylab("Gene methylation % - 24-60 hrs") + xlab(NULL)

# tiff("ExpMethyl_cor_24hrs.tiff", width = 2.5, height = 3.5, units = "in",res = 300,compression = "lzw")
# ExpMethyl_cor_24hrs
# dev.off()
suppressWarnings(grid.arrange(ExpMethyl_cor_24hrs))
```
## Methylation for different developmental gene categories - Fig 5J

11-12 hrs methylation

```{r fig9, fig.height = 6, fig.width = 4}
# setwd("/drives/ssd1/manuel/phaw/2022_analysis/wgbs_analysis/")
gene_categories <- read.table("/drives/ssd1/manuel/phaw/2022_analysis/mzt_analysis/dev_genes_categories.csv",header = T,stringsAsFactors = F,sep=",")

my_comparisons = list(c("Minor wave all","Minor wave zygotic"), c("Maternal","Minor wave all"),
                      c("Major wave all","Major wave zygotic"),c("Maternal","Minor wave zygotic"))#,c("Maternal","Major wave zygotic"))

ZGA_methylation_12h  <- geneBody_full_methData %>% #replace(is.na(.), 0) %>%
  mutate(Class = ifelse(gene %in% gene_categories$pz_minor_wave,"Minor wave zygotic",
                      ifelse(gene %in% gene_categories$minor_wave_allGenes,"Minor wave all",
                               ifelse(gene %in% gene_categories$pz_major_wave,"Major wave zygotic", 
                                      ifelse(gene %in% gene_categories$maternal_genes_noZGA,"Maternal",
                                             ifelse(gene %in% gene_categories$major_wave_allGenes,"Major wave all","rest")))))) %>% 
  mutate(Class = factor(Class,levels = c("Maternal","Minor wave all","Minor wave zygotic","Major wave all","Major wave zygotic"))) %>%
  filter(Class != "rest",class== "Cov12hrs") %>% #group_by(Class) %>% summarise(meanss=mean(tx_ln))
  ggplot(aes(x=Class, y=methylation,fill=Class),size=0.3) + geom_jitter(size=0.3, alpha=0.3,color="grey") + geom_violin(alpha=0.5) + theme_minimal() +
  ylim(NA,30)+
  #scale_y_break(c(30, 60), scales = 0.1, ticklabels=c(60, 100)) +# scale_y_continuous(expand = c(0, 50)) +
  xlab(NULL) + ylab("Gene methylation - 11-12 hrs") +
  stat_summary(fun=median, geom="point", shape=23, size=1.5, color="black", fill="red",alpha=1)+
  scale_fill_manual(name=" ", values=c("navyblue", "#2B83BA", "orange","firebrick4","seagreen4")) +
  theme(axis.text.x = element_text(angle = 20, vjust = 0.9, hjust=0.75,size=10),
        axis.text.y = element_text(size=8), axis.title.x=element_blank(),legend.position="none",
        axis.title.y = element_text(size = 14),
        axis.text.y.right = element_blank(),
        axis.line.y.right = element_blank(),
        axis.ticks.y.right = element_blank())# + stat_compare_means(comparisons = my_comparisons,label = "p.signif",size=2.5,vjust = 0)# + stat_compare_means(label.y = 8, label.x=1.5, size=5) 
#ZGA_methylation_12h

# tiff("ZGA_methylation_12h.tiff", width = 3, height = 3.5, units = "in",res = 800,compression = "lzw")
# ZGA_methylation_12h
# dev.off()
suppressWarnings(grid.arrange(ZGA_methylation_12h))

```
24-60 hrs methylation

```{r fig10, fig.height = 6, fig.width = 4}
ZGA_methylation_24_60h  <- geneBody_full_methData %>% #replace(is.na(.), 0) %>%
  mutate(Class = ifelse(gene %in% gene_categories$pz_minor_wave,"Minor wave zygotic",
                      ifelse(gene %in% gene_categories$minor_wave_allGenes,"Minor wave all",
                               ifelse(gene %in% gene_categories$pz_major_wave,"Major wave zygotic", 
                                      ifelse(gene %in% gene_categories$maternal_genes_noZGA,"Maternal",
                                             ifelse(gene %in% gene_categories$major_wave_allGenes,"Major wave all","rest")))))) %>% 
  mutate(Class = factor(Class,levels = c("Maternal","Minor wave all","Minor wave zygotic","Major wave all","Major wave zygotic"))) %>%
  filter(Class != "rest",class== "Cov24hrs") %>% #group_by(Class) %>% summarise(meanss=mean(tx_ln))
  ggplot(aes(x=Class, y=methylation,fill=Class),size=0.3) + geom_jitter(size=0.3, alpha=0.3,color="grey") + geom_violin(alpha=0.5) + theme_minimal() +
  ylim(NA,30)+
  #scale_y_break(c(30, 60), scales = 0.1, ticklabels=c(60, 100)) +# scale_y_continuous(expand = c(0, 50)) +
  xlab(NULL) + ylab("Gene methylation - 24-60 hrs") +
  stat_summary(fun=median, geom="point", shape=23, size=1.5, color="black", fill="red",alpha=1)+
  scale_fill_manual(name=" ", values=c("navyblue", "#2B83BA", "orange","firebrick4","seagreen4")) +
  theme(axis.text.x = element_text(angle = 20, vjust = 0.9, hjust=0.75,size=10),
        axis.text.y = element_text(size=8), axis.title.x=element_blank(),legend.position="none",
        axis.title.y = element_text(size = 14),
        axis.text.y.right = element_blank(),
        axis.line.y.right = element_blank(),
        axis.ticks.y.right = element_blank())# + stat_compare_means(comparisons = my_comparisons,label = "p.signif",size=2.5,vjust = 0)# + stat_compare_means(label.y = 8, label.x=1.5, size=5) 

suppressWarnings(grid.arrange(ZGA_methylation_24_60h))
```

